
[ovs-dev,v3] netdev-dpdk: Implement TCP/UDP TX cksum in ovs-dpdk side

Message ID 20170816133600.11018-1-sysugaozhenyu@gmail.com
State Superseded

Commit Message

Gao Zhenyu Aug. 16, 2017, 1:36 p.m. UTC
Currently, the dpdk-vhost side in ovs doesn't support tcp/udp tx cksum,
so L4 packets' cksums are calculated on the VM side, which hurts
performance.
Implementing tcp/udp tx cksum on the ovs-dpdk side improves throughput
and makes the virtio-net frontend driver support NETIF_F_SG as well.

Signed-off-by: Zhenyu Gao <sysugaozhenyu@gmail.com>
---

Here are some performance numbers:

Setup:

 qperf client
+---------+
|   VM    |
+---------+
     |
     |                          qperf server
+--------------+              +------------+
| vswitch+dpdk |              | bare-metal |
+--------------+              +------------+
       |                            |
       |                            |
      pNic---------PhysicalSwitch----

do cksum in ovs-dpdk: Applied this patch and executed 'ethtool -K eth0 tx on' on the VM side.
                      This offloads the cksum job to the ovs-dpdk side.

do cksum in VM: Applied this patch and executed 'ethtool -K eth0 tx off' on the VM side.
                The VM calculates cksums for tcp/udp packets.
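(Either way, the resulting offload state can be checked from inside the VM with
'ethtool -k eth0', which lists the current tx-checksumming setting.)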

We can see a huge improvement in TCP throughput if we leverage ovs-dpdk cksum offload.
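Each test below sweeps the message size from 1 byte up to 64 KB, doubling at
every step (the '-oo msg_size:1:64K:*2' option).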

[root@localhost ~]# qperf -t 10 -oo msg_size:1:64K:*2  host-qperf-server01 tcp_bw tcp_lat udp_bw udp_lat
  do cksum in ovs-dpdk          do cksum in VM             without this patch
tcp_bw:
    bw  =  2.05 MB/sec        bw  =  1.92 MB/sec        bw  =  1.95 MB/sec
tcp_bw:
    bw  =  3.9 MB/sec         bw  =  3.99 MB/sec        bw  =  3.98 MB/sec
tcp_bw:
    bw  =  8.09 MB/sec        bw  =  7.82 MB/sec        bw  =  8.19 MB/sec
tcp_bw:
    bw  =  14.9 MB/sec        bw  =  14.8 MB/sec        bw  =  15.7 MB/sec
tcp_bw:
    bw  =  27.7 MB/sec        bw  =  28 MB/sec          bw  =  29.7 MB/sec
tcp_bw:
    bw  =  51.2 MB/sec        bw  =  50.9 MB/sec        bw  =  54.9 MB/sec
tcp_bw:
    bw  =  86.7 MB/sec        bw  =  86.8 MB/sec        bw  =  95.1 MB/sec
tcp_bw: 
    bw  =  149 MB/sec         bw  =  160 MB/sec         bw  =  149 MB/sec
tcp_bw:
    bw  =  211 MB/sec         bw  =  205 MB/sec         bw  =  216 MB/sec
tcp_bw:
    bw  =  271 MB/sec         bw  =  254 MB/sec         bw  =  275 MB/sec
tcp_bw:
    bw  =  326 MB/sec         bw  =  303 MB/sec         bw  =  321 MB/sec
tcp_bw:
    bw  =  407 MB/sec         bw  =  359 MB/sec         bw  =  361 MB/sec
tcp_bw:
    bw  =  816 MB/sec         bw  =  512 MB/sec         bw  =  419 MB/sec
tcp_bw: 
    bw  =  840 MB/sec         bw  =  756 MB/sec         bw  =  457 MB/sec
tcp_bw:
    bw  =  1.07 GB/sec        bw  =  880 MB/sec         bw  =  480 MB/sec
tcp_bw:
    bw  =  1.17 GB/sec        bw  =  1.01 GB/sec        bw  =  488 MB/sec
tcp_bw:
    bw  =  1.17 GB/sec        bw  =  1.11 GB/sec        bw  =  483 MB/sec
tcp_lat:
    latency  =  29 us         latency  =  29.2 us       latency  =  29.6 us
tcp_lat:
    latency  =  28.9 us       latency  =  29.3 us       latency  =  29.5 us
tcp_lat:
    latency  =  29 us         latency  =  29.3 us       latency  =  29.6 us
tcp_lat:
    latency  =  29 us         latency  =  29.4 us       latency  =  29.5 us
tcp_lat:
    latency  =  29 us         latency  =  29.2 us       latency  =  29.6 us
tcp_lat:
    latency  =  29.1 us       latency  =  29.3 us       latency  =  29.7 us
tcp_lat:
    latency  =  29.4 us       latency  =  29.6 us       latency  =  30 us
tcp_lat:
    latency  =  29.8 us       latency  =  30.1 us       latency  =  30.2 us
tcp_lat:
    latency  =  30.9 us       latency  =  30.9 us       latency  =  31 us
tcp_lat:
    latency  =  46.9 us       latency  =  46.2 us       latency  =  32.2 us
tcp_lat:
    latency  =  51.5 us       latency  =  52.6 us       latency  =  34.5 us
tcp_lat:
    latency  =  43.9 us       latency  =  43.8 us       latency  =  43.6 us
tcp_lat:
    latency  =  47.6 us       latency  =  48 us         latency  =  48.1 us
tcp_lat:
    latency  =  77.7 us       latency  =  78.8 us       latency  =  78.8 us
tcp_lat:
    latency  =  82.8 us       latency  =  82.3 us       latency  =  116 us
tcp_lat:
    latency  =  94.8 us       latency  =  94.2 us       latency  =  134 us
tcp_lat:
    latency  =  167 us        latency  =  197 us        latency  =  172 us
udp_bw:
    send_bw  =  418 KB/sec    send_bw  =  413 KB/sec    send_bw  =  403 KB/sec
    recv_bw  =  410 KB/sec    recv_bw  =  412 KB/sec    recv_bw  =  400 KB/sec
udp_bw:
    send_bw  =  831 KB/sec    send_bw  =  825 KB/sec    send_bw  =  810 KB/sec
    recv_bw  =  828 KB/sec    recv_bw  =  816 KB/sec    recv_bw  =  807 KB/sec
udp_bw:
    send_bw  =  1.67 MB/sec   send_bw  =  1.65 MB/sec   send_bw  =  1.63 MB/sec
    recv_bw  =  1.64 MB/sec   recv_bw  =  1.62 MB/sec   recv_bw  =  1.63 MB/sec
udp_bw:
    send_bw  =  3.36 MB/sec   send_bw  =  3.29 MB/sec   send_bw  =  3.26 MB/sec
    recv_bw  =  3.29 MB/sec   recv_bw  =  3.25 MB/sec   recv_bw  =  2.82 MB/sec
udp_bw:
    send_bw  =  6.72 MB/sec   send_bw  =  6.61 MB/sec   send_bw  =  6.45 MB/sec
    recv_bw  =  6.54 MB/sec   recv_bw  =  6.59 MB/sec   recv_bw  =  6.45 MB/sec
udp_bw:
    send_bw  =  13.4 MB/sec   send_bw  =  13.2 MB/sec   send_bw  =  13 MB/sec
    recv_bw  =  13.1 MB/sec   recv_bw  =  13.1 MB/sec   recv_bw  =  13 MB/sec
udp_bw:
    send_bw  =  26.8 MB/sec   send_bw  =  26.4 MB/sec   send_bw  =  25.9 MB/sec
    recv_bw  =  26.4 MB/sec   recv_bw  =  26.2 MB/sec   recv_bw  =  25.7 MB/sec
udp_bw:
    send_bw  =  53.4 MB/sec   send_bw  =  52.5 MB/sec   send_bw  =    52 MB/sec
    recv_bw  =  48.4 MB/sec   recv_bw  =  51.8 MB/sec   recv_bw  =  51.2 MB/sec
udp_bw:
    send_bw  =   106 MB/sec   send_bw  =   104 MB/sec   send_bw  =  103 MB/sec
    recv_bw  =  98.9 MB/sec   recv_bw  =  93.2 MB/sec   recv_bw  =  100 MB/sec
udp_bw:
    send_bw  =  213 MB/sec    send_bw  =  206 MB/sec    send_bw  =  205 MB/sec
    recv_bw  =  197 MB/sec    recv_bw  =  196 MB/sec    recv_bw  =  202 MB/sec
udp_bw:
    send_bw  =  417 MB/sec    send_bw  =  405 MB/sec    send_bw  =  401 MB/sec
    recv_bw  =  400 MB/sec    recv_bw  =  333 MB/sec    recv_bw  =  358 MB/sec
udp_bw:
    send_bw  =  556 MB/sec    send_bw  =  552 MB/sec    send_bw  =  557 MB/sec
    recv_bw  =  361 MB/sec    recv_bw  =  365 MB/sec    recv_bw  =  362 MB/sec
udp_bw:
    send_bw  =  865 MB/sec    send_bw  =  866 MB/sec    send_bw  =  863 MB/sec
    recv_bw  =  564 MB/sec    recv_bw  =  573 MB/sec    recv_bw  =  584 MB/sec
udp_bw:
    send_bw  =  1.05 GB/sec   send_bw  =  1.09 GB/sec   send_bw  =  1.08 GB/sec
    recv_bw  =   789 MB/sec   recv_bw  =   732 MB/sec   recv_bw  =   793 MB/sec
udp_bw: 
    send_bw  =  1.18 GB/sec   send_bw  =  1.23 GB/sec   send_bw  =  1.19 GB/sec
    recv_bw  =   658 MB/sec   recv_bw  =   788 MB/sec   recv_bw  =   673 MB/sec
udp_bw:
    send_bw  =  1.3 GB/sec    send_bw  =  1.3 GB/sec    send_bw  =  1.3 GB/sec
    recv_bw  =  659 MB/sec    recv_bw  =  763 MB/sec    recv_bw  =  762 MB/sec
udp_bw:
    send_bw  =  0 bytes/sec   send_bw  =  0 bytes/sec   send_bw  =  0 bytes/sec
    recv_bw  =  0 bytes/sec   recv_bw  =  0 bytes/sec   recv_bw  =  0 bytes/sec
udp_lat:
    latency  =  26.7 us       latency  =  26.5 us       latency  =  26.4 us
udp_lat:
    latency  =  26.7 us       latency  =  26.5 us       latency  =  26.3 us
udp_lat:
    latency  =  26.7 us       latency  =  26.7 us       latency  =  26.3 us
udp_lat:
    latency  =  26.7 us       latency  =  26.6 us       latency  =  26.3 us
udp_lat:
    latency  =  26.7 us       latency  =  26.7 us       latency  =  26.7 us
udp_lat:
    latency  =  27 us         latency  =  26.7 us       latency  =  26.6 us
udp_lat:
    latency  =  27 us         latency  =  26.9 us       latency  =  26.7 us
udp_lat:
    latency  =  27.6 us       latency  =  27.4 us       latency  =  27.3 us
udp_lat:
    latency  =  28.1 us       latency  =  28 us         latency  =  28 us
udp_lat:
    latency  =  29.4 us       latency  =  29.2 us       latency  =  29.2 us
udp_lat:
    latency  =  31 us         latency  =  31 us         latency  =  30.8 us
udp_lat:
    latency  =  41.4 us       latency  =  41.4 us       latency  =  41.3 us
udp_lat:
    latency  =  41.6 us       latency  =  41.5 us       latency  =  41.5 us
udp_lat:
    latency  =  64.9 us       latency  =  65 us         latency  =  65 us
udp_lat:
    latency  =  72.3 us       latency  =  72 us         latency  =  72 us
udp_lat:
    latency  =  121 us        latency  =  122 us        latency  =  122 us
udp_lat: 
    latency  =  0 ns          latency  =  0 ns          latency  =  0 ns

 lib/netdev-dpdk.c | 81 ++++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 77 insertions(+), 4 deletions(-)

Comments

Ciara Loftus Sept. 1, 2017, 3:17 p.m. UTC | #1
Hi Zhenyu,

Thanks for the v3. No feedback yet on the common implementation, so for now let's focus on this implementation.

Some high level comments:
- By moving the calculation to vhost rx, we've removed the impact on non-vhost topology performance.
- The patch needs a rebase.
- checkpatch.py complains about a whitespace error. Please run "utilities/checkpatch.py" on any subsequent patches before you post.
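  (For example, 'utilities/checkpatch.py <your.patch>' run against the generated patch file.)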

Some comments inline too. I didn't get to verify performance with this review.

Thanks,
Ciara

> Currently, the dpdk-vhost side in ovs doesn't support tcp/udp tx cksum,
> so L4 packets' cksums are calculated on the VM side, which hurts
> performance.
> Implementing tcp/udp tx cksum on the ovs-dpdk side improves throughput and

"Improves throughput" is a very general statement. Is the statement true for uses cases other than that which you benchmarked?

> makes the virtio-net frontend driver support NETIF_F_SG as well.
> 
> Signed-off-by: Zhenyu Gao <sysugaozhenyu@gmail.com>
> ---
> 
> diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
> index 1d82bca..cdaf21a 100644
> --- a/lib/netdev-dpdk.c
> +++ b/lib/netdev-dpdk.c
> @@ -31,6 +31,7 @@
>  #include <rte_errno.h>
>  #include <rte_eth_ring.h>
>  #include <rte_ethdev.h>
> +#include <rte_ip.h>
>  #include <rte_malloc.h>
>  #include <rte_mbuf.h>
>  #include <rte_meter.h>
> @@ -991,8 +992,7 @@ netdev_dpdk_vhost_construct(struct netdev
> *netdev)
> 
>      err = rte_vhost_driver_disable_features(dev->vhost_id,
>                                  1ULL << VIRTIO_NET_F_HOST_TSO4
> -                                | 1ULL << VIRTIO_NET_F_HOST_TSO6
> -                                | 1ULL << VIRTIO_NET_F_CSUM);
> +                                | 1ULL << VIRTIO_NET_F_HOST_TSO6);
>      if (err) {
>          VLOG_ERR("rte_vhost_driver_disable_features failed for vhost user "
>                   "port: %s\n", name);
> @@ -1454,6 +1454,78 @@ netdev_dpdk_rxq_dealloc(struct netdev_rxq
> *rxq)
>      rte_free(rx);
>  }
> 
> +static inline void
> +netdev_refill_l4_cksum(const char *data, struct dp_packet *pkt,
> +                       uint8_t l4_proto, bool is_ipv4)

I think this function should be renamed netdev_dpdk_vhost_refill_l4_checksum as it is vhost-specific.
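For example, just the rename (sketch):

    static inline void
    netdev_dpdk_vhost_refill_l4_checksum(const char *data, struct dp_packet *pkt,
                                         uint8_t l4_proto, bool is_ipv4)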

> +{
> +    void *l3hdr = (void *)(data + pkt->mbuf.l2_len);
> +
> +    if (l4_proto == IPPROTO_TCP) {
> +        struct tcp_header *tcp_hdr = (struct tcp_header *)(data +
> +                                         pkt->mbuf.l2_len + pkt->mbuf.l3_len);
> +
> +        tcp_hdr->tcp_csum = 0;
> +        if (is_ipv4) {
> +            tcp_hdr->tcp_csum = rte_ipv4_udptcp_cksum(l3hdr, tcp_hdr);
> +        } else {
> +            tcp_hdr->tcp_csum = rte_ipv6_udptcp_cksum(l3hdr, tcp_hdr);
> +        }
> +    } else if (l4_proto == IPPROTO_UDP) {
> +        struct udp_header *udp_hdr = (struct udp_header *)(data +
> +                                         pkt->mbuf.l2_len + pkt->mbuf.l3_len);
> +        /* do not recalculate udp cksum if it was 0 */
> +        if (udp_hdr->udp_csum != 0) {
> +            udp_hdr->udp_csum = 0;
> +            if (is_ipv4) {
> +                /*do not calculate udp cksum if it was a fragment IP*/
> +                if (IP_IS_FRAGMENT(((struct ipv4_hdr *)l3hdr)->
> +                                      fragment_offset)) {
> +                    return;
> +                }
> +
> +                udp_hdr->udp_csum = rte_ipv4_udptcp_cksum(l3hdr, udp_hdr);
> +            } else {
> +                udp_hdr->udp_csum = rte_ipv6_udptcp_cksum(l3hdr, udp_hdr);
> +            }
> +        }
> +    }
> +
> +    pkt->mbuf.ol_flags &= ~PKT_TX_L4_MASK;
> +}
> +
> +static inline void
> +netdev_dpdk_vhost_tx_csum(struct dp_packet **pkts, int pkt_cnt)
> +{
> +    int i;
> +
> +    for (i = 0; i < pkt_cnt; i++) {
> +        ovs_be16 dl_type;
> +        struct dp_packet *pkt = (struct dp_packet *)pkts[i];
> +        const char *data = dp_packet_data(pkt);
> +        void *l3hdr = (char *)(data + pkt->mbuf.l2_len);
> +
> +        if (!(pkt->mbuf.ol_flags & PKT_TX_L4_MASK)) {
> +            // DPDK vhost-user tags PKT_TX_L4_MASK if a L4 packet need cksum

This is the line checkpatch.py complains about. Please use /* convention for comments.
Use Documentation/internals/contributing/coding-style.rst for reference.
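For example, something like:

    /* DPDK vhost-user tags PKT_TX_L4_MASK if an L4 packet needs a cksum. */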

> +            continue;
> +        }
> +
> +        if (OVS_UNLIKELY(pkt->mbuf.l2_len == 0 || pkt->mbuf.l3_len == 0)) {
> +            continue;
> +        }
> +
> +        dl_type = *(ovs_be16 *)(data + pkt->mbuf.l2_len - 2);

The '2' here looks like magic number. Could it be defined somewhere with an intuitive title?

> +        if (dl_type == htons(ETH_TYPE_IP)) {
> +            netdev_refill_l4_cksum(data, pkt,
> +                                   ((struct ipv4_hdr *)l3hdr)->next_proto_id,
> +                                   true);
> +        } else if (dl_type == htons(ETH_TYPE_IPV6)) {
> +            netdev_refill_l4_cksum(data, pkt,
> +                                   ((struct ipv6_hdr *)l3hdr)->proto,
> +                                   false);
> +        }
> +    }
> +}
> +
>  /* Tries to transmit 'pkts' to txq 'qid' of device 'dev'.  Takes ownership of
>   * 'pkts', even in case of failure.
>   *
> @@ -1644,6 +1716,8 @@ netdev_dpdk_vhost_rxq_recv(struct netdev_rxq
> *rxq,
>      rte_spinlock_unlock(&dev->stats_lock);
> 
>      batch->count = (int) nb_rx;
> +    netdev_dpdk_vhost_tx_csum(batch->packets, batch->count);
> +
>      return 0;
>  }
> 
> @@ -3285,8 +3359,7 @@ netdev_dpdk_vhost_client_reconfigure(struct
> netdev *netdev)
> 
>          err = rte_vhost_driver_disable_features(dev->vhost_id,
>                                      1ULL << VIRTIO_NET_F_HOST_TSO4
> -                                    | 1ULL << VIRTIO_NET_F_HOST_TSO6
> -                                    | 1ULL << VIRTIO_NET_F_CSUM);
> +                                    | 1ULL << VIRTIO_NET_F_HOST_TSO6);
>          if (err) {
>              VLOG_ERR("rte_vhost_driver_disable_features failed for vhost user "
>                       "client port: %s\n", dev->up.name);
> --
> 1.8.3.1
Gao Zhenyu Sept. 2, 2017, 5:06 a.m. UTC | #2
Hi Ciara,

  Thanks for your comments!
  I submitted a new version with new testing results here:
https://patchwork.ozlabs.org/patch/809042/    :)

Thanks
Zhenyu Gao


Patch

diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index 1d82bca..cdaf21a 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -31,6 +31,7 @@ 
 #include <rte_errno.h>
 #include <rte_eth_ring.h>
 #include <rte_ethdev.h>
+#include <rte_ip.h>
 #include <rte_malloc.h>
 #include <rte_mbuf.h>
 #include <rte_meter.h>
@@ -991,8 +992,7 @@  netdev_dpdk_vhost_construct(struct netdev *netdev)
 
     err = rte_vhost_driver_disable_features(dev->vhost_id,
                                 1ULL << VIRTIO_NET_F_HOST_TSO4
-                                | 1ULL << VIRTIO_NET_F_HOST_TSO6
-                                | 1ULL << VIRTIO_NET_F_CSUM);
+                                | 1ULL << VIRTIO_NET_F_HOST_TSO6);
     if (err) {
         VLOG_ERR("rte_vhost_driver_disable_features failed for vhost user "
                  "port: %s\n", name);
@@ -1454,6 +1454,78 @@  netdev_dpdk_rxq_dealloc(struct netdev_rxq *rxq)
     rte_free(rx);
 }
 
+static inline void
+netdev_refill_l4_cksum(const char *data, struct dp_packet *pkt,
+                       uint8_t l4_proto, bool is_ipv4)
+{
+    void *l3hdr = (void *)(data + pkt->mbuf.l2_len);
+
+    if (l4_proto == IPPROTO_TCP) {
+        struct tcp_header *tcp_hdr = (struct tcp_header *)(data +
+                                         pkt->mbuf.l2_len + pkt->mbuf.l3_len);
+
+        tcp_hdr->tcp_csum = 0;
+        if (is_ipv4) {
+            tcp_hdr->tcp_csum = rte_ipv4_udptcp_cksum(l3hdr, tcp_hdr);
+        } else {
+            tcp_hdr->tcp_csum = rte_ipv6_udptcp_cksum(l3hdr, tcp_hdr);
+        }
+    } else if (l4_proto == IPPROTO_UDP) {
+        struct udp_header *udp_hdr = (struct udp_header *)(data +
+                                         pkt->mbuf.l2_len + pkt->mbuf.l3_len);
+        /* do not recalculate udp cksum if it was 0 */
+        if (udp_hdr->udp_csum != 0) {
+            udp_hdr->udp_csum = 0;
+            if (is_ipv4) {
+                /*do not calculate udp cksum if it was a fragment IP*/
+                if (IP_IS_FRAGMENT(((struct ipv4_hdr *)l3hdr)->
+                                      fragment_offset)) {
+                    return;
+                }
+
+                udp_hdr->udp_csum = rte_ipv4_udptcp_cksum(l3hdr, udp_hdr);
+            } else {
+                udp_hdr->udp_csum = rte_ipv6_udptcp_cksum(l3hdr, udp_hdr);
+            }
+        }
+    }
+
+    pkt->mbuf.ol_flags &= ~PKT_TX_L4_MASK;
+}
+
+static inline void
+netdev_dpdk_vhost_tx_csum(struct dp_packet **pkts, int pkt_cnt)
+{
+    int i;
+
+    for (i = 0; i < pkt_cnt; i++) {
+        ovs_be16 dl_type;
+        struct dp_packet *pkt = (struct dp_packet *)pkts[i];
+        const char *data = dp_packet_data(pkt);
+        void *l3hdr = (char *)(data + pkt->mbuf.l2_len);
+
+        if (!(pkt->mbuf.ol_flags & PKT_TX_L4_MASK)) {
+            // DPDK vhost-user tags PKT_TX_L4_MASK if a L4 packet need cksum
+            continue;
+        }
+
+        if (OVS_UNLIKELY(pkt->mbuf.l2_len == 0 || pkt->mbuf.l3_len == 0)) {
+            continue;
+        }
+
+        dl_type = *(ovs_be16 *)(data + pkt->mbuf.l2_len - 2);
+        if (dl_type == htons(ETH_TYPE_IP)) {
+            netdev_refill_l4_cksum(data, pkt,
+                                   ((struct ipv4_hdr *)l3hdr)->next_proto_id,
+                                   true);
+        } else if (dl_type == htons(ETH_TYPE_IPV6)) {
+            netdev_refill_l4_cksum(data, pkt,
+                                   ((struct ipv6_hdr *)l3hdr)->proto,
+                                   false);
+        }
+    }
+}
+
 /* Tries to transmit 'pkts' to txq 'qid' of device 'dev'.  Takes ownership of
  * 'pkts', even in case of failure.
  *
@@ -1644,6 +1716,8 @@  netdev_dpdk_vhost_rxq_recv(struct netdev_rxq *rxq,
     rte_spinlock_unlock(&dev->stats_lock);
 
     batch->count = (int) nb_rx;
+    netdev_dpdk_vhost_tx_csum(batch->packets, batch->count);
+
     return 0;
 }
 
@@ -3285,8 +3359,7 @@  netdev_dpdk_vhost_client_reconfigure(struct netdev *netdev)
 
         err = rte_vhost_driver_disable_features(dev->vhost_id,
                                     1ULL << VIRTIO_NET_F_HOST_TSO4
-                                    | 1ULL << VIRTIO_NET_F_HOST_TSO6
-                                    | 1ULL << VIRTIO_NET_F_CSUM);
+                                    | 1ULL << VIRTIO_NET_F_HOST_TSO6);
         if (err) {
             VLOG_ERR("rte_vhost_driver_disable_features failed for vhost user "
                      "client port: %s\n", dev->up.name);