diff mbox

[ovs-dev,RFC] netdev-dpdk: add support for TSO

Message ID 1491929217-209978-1-git-send-email-mark.b.kavanagh@intel.com
State RFC
Delegated to: Darrell Ball
Headers show

Commit Message

Mark Kavanagh April 11, 2017, 4:46 p.m. UTC
TCP Segmentation Offload (TSO) is a feature which enables
the TCP/IP network stack to delegate segmentation of a TCP
segment to the NIC, thus saving compute resources.

This commit adds support for TSO in the DPDK vHost-User backend,
to OvS v2.6.1; this enables a guest to offload segmentation of
TCP segments that it sends to OvS.

This patch is not intended for upstreaming, but rather was produced
in response to requests for an updated version of the initial TSO RFC
patch posted here:
https://mail.openvswitch.org/pipermail/ovs-dev/2016-June/316414.html

Signed-off-by: Mark Kavanagh <mark.b.kavanagh@intel.com>
---
 INSTALL.DPDK-ADVANCED.md | 104 ++++++++++++++++++++++++++++++++++++++++++++++-
 lib/dp-packet.h          |  21 +++++-----
 lib/netdev-dpdk.c        |  79 +++++++++++++++++++++++++++--------
 lib/packets.c            |  14 +++++++
 4 files changed, 189 insertions(+), 29 deletions(-)

Comments

Aaron Conole April 12, 2017, 5:38 p.m. UTC | #1
Hi Mark,

Mark Kavanagh <mark.b.kavanagh@intel.com> writes:

> TCP Segmentation Offload (TSO) is a feature which enables
> the TCP/IP network stack to delegate segmentation of a TCP
> segment to the NIC, thus saving compute resources.
>
> This commit adds support for TSO in the DPDK vHost-User backend,
> to OvS v2.6.1; this enables a guest to offload segmentation of
> TCP segments that it sends to OvS.
>
> This patch is not intended for upstreaming, but rather was produced
> in response to requests for an updated version of the initial TSO RFC
> patch posted here:
> https://mail.openvswitch.org/pipermail/ovs-dev/2016-June/316414.html
>
> Signed-off-by: Mark Kavanagh <mark.b.kavanagh@intel.com>
> ---

...

> diff --git a/lib/packets.c b/lib/packets.c
> index e4c29d5..2417ba2 100644
> --- a/lib/packets.c
> +++ b/lib/packets.c
> @@ -33,6 +33,10 @@
>  #include "dp-packet.h"
>  #include "unaligned.h"
>  
> +#ifdef DPDK_NETDEV
> +#include "rte_ether.h"
> +#endif
> +
>  const struct in6_addr in6addr_exact = IN6ADDR_EXACT_INIT;
>  const struct in6_addr in6addr_all_hosts = IN6ADDR_ALL_HOSTS_INIT;
>  
> @@ -204,6 +208,11 @@ eth_push_vlan(struct dp_packet *packet, ovs_be16 tpid, ovs_be16 tci)
>      memmove(veh, (char *)veh + VLAN_HEADER_LEN, 2 * ETH_ADDR_LEN);
>      veh->veth_type = tpid;
>      veh->veth_tci = tci & htons(~VLAN_CFI);
> +
> +#ifdef DPDK_NETDEV
> +    struct rte_mbuf *pkt = &(packet->mbuf);
> +    pkt->l2_len += sizeof(struct vlan_hdr);
> +#endif
>  }
>  
>  /* Removes outermost VLAN header (if any is present) from 'packet'.
> @@ -221,6 +230,11 @@ eth_pop_vlan(struct dp_packet *packet)
>          memmove((char *)veh + VLAN_HEADER_LEN, veh, 2 * ETH_ADDR_LEN);
>          dp_packet_resize_l2(packet, -VLAN_HEADER_LEN);
>      }
> +
> +#ifdef DPDK_NETDEV
> +    struct rte_mbuf *pkt = &(packet->mbuf);
> +    pkt->l2_len -= sizeof(struct vlan_hdr);
> +#endif
>  }
>  
>  /* Set ethertype of the packet. */

Would it be better to change the dp_packet_resize_l2 call?  Are you
worried about the mpls case?
Mark Kavanagh April 13, 2017, 8:58 a.m. UTC | #2
>Hi Mark,
>
>Mark Kavanagh <mark.b.kavanagh@intel.com> writes:
>
>> TCP Segmentation Offload (TSO) is a feature which enables
>> the TCP/IP network stack to delegate segmentation of a TCP
>> segment to the NIC, thus saving compute resources.
>>
>> This commit adds support for TSO in the DPDK vHost-User backend,
>> to OvS v2.6.1; this enables a guest to offload segmentation of
>> TCP segments that it sends to OvS.
>>
>> This patch is not intended for upstreaming, but rather was produced
>> in response to requests for an updated version of the initial TSO RFC
>> patch posted here:
>> https://mail.openvswitch.org/pipermail/ovs-dev/2016-June/316414.html
>>
>> Signed-off-by: Mark Kavanagh <mark.b.kavanagh@intel.com>
>> ---
>
>...
>
>> diff --git a/lib/packets.c b/lib/packets.c
>> index e4c29d5..2417ba2 100644
>> --- a/lib/packets.c
>> +++ b/lib/packets.c
>> @@ -33,6 +33,10 @@
>>  #include "dp-packet.h"
>>  #include "unaligned.h"
>>
>> +#ifdef DPDK_NETDEV
>> +#include "rte_ether.h"
>> +#endif
>> +
>>  const struct in6_addr in6addr_exact = IN6ADDR_EXACT_INIT;
>>  const struct in6_addr in6addr_all_hosts = IN6ADDR_ALL_HOSTS_INIT;
>>
>> @@ -204,6 +208,11 @@ eth_push_vlan(struct dp_packet *packet, ovs_be16 tpid, ovs_be16 tci)
>>      memmove(veh, (char *)veh + VLAN_HEADER_LEN, 2 * ETH_ADDR_LEN);
>>      veh->veth_type = tpid;
>>      veh->veth_tci = tci & htons(~VLAN_CFI);
>> +
>> +#ifdef DPDK_NETDEV
>> +    struct rte_mbuf *pkt = &(packet->mbuf);
>> +    pkt->l2_len += sizeof(struct vlan_hdr);
>> +#endif
>>  }
>>
>>  /* Removes outermost VLAN header (if any is present) from 'packet'.
>> @@ -221,6 +230,11 @@ eth_pop_vlan(struct dp_packet *packet)
>>          memmove((char *)veh + VLAN_HEADER_LEN, veh, 2 * ETH_ADDR_LEN);
>>          dp_packet_resize_l2(packet, -VLAN_HEADER_LEN);
>>      }
>> +
>> +#ifdef DPDK_NETDEV
>> +    struct rte_mbuf *pkt = &(packet->mbuf);
>> +    pkt->l2_len -= sizeof(struct vlan_hdr);
>> +#endif
>>  }
>>
>>  /* Set ethertype of the packet. */
>
>Would it be better to change the dp_packet_resize_l2 call?  Are you

Hey Aaron,

Good call - that would definitely be a more suitable location.

>worried about the mpls case?

I haven't considered mpls here at all, as it wasn't part of the use case for which this new version of the patch was produced I'm afraid.
Out of curiosity, what is your concern here?

Thanks in advance,
Mark
Aaron Conole April 13, 2017, 1:54 p.m. UTC | #3
"Kavanagh, Mark B" <mark.b.kavanagh@intel.com> writes:

>>Hi Mark,
>>
>>Mark Kavanagh <mark.b.kavanagh@intel.com> writes:
>>
>>> TCP Segmentation Offload (TSO) is a feature which enables
>>> the TCP/IP network stack to delegate segmentation of a TCP
>>> segment to the NIC, thus saving compute resources.
>>>
>>> This commit adds support for TSO in the DPDK vHost-User backend,
>>> to OvS v2.6.1; this enables a guest to offload segmentation of
>>> TCP segments that it sends to OvS.
>>>
>>> This patch is not intended for upstreaming, but rather was produced
>>> in response to requests for an updated version of the initial TSO RFC
>>> patch posted here:
>>> https://mail.openvswitch.org/pipermail/ovs-dev/2016-June/316414.html
>>>
>>> Signed-off-by: Mark Kavanagh <mark.b.kavanagh@intel.com>
>>> ---
>>
>>...
>>
>>> diff --git a/lib/packets.c b/lib/packets.c
>>> index e4c29d5..2417ba2 100644
>>> --- a/lib/packets.c
>>> +++ b/lib/packets.c
>>> @@ -33,6 +33,10 @@
>>>  #include "dp-packet.h"
>>>  #include "unaligned.h"
>>>
>>> +#ifdef DPDK_NETDEV
>>> +#include "rte_ether.h"
>>> +#endif
>>> +
>>>  const struct in6_addr in6addr_exact = IN6ADDR_EXACT_INIT;
>>>  const struct in6_addr in6addr_all_hosts = IN6ADDR_ALL_HOSTS_INIT;
>>>
>>> @@ -204,6 +208,11 @@ eth_push_vlan(struct dp_packet *packet, ovs_be16 tpid, ovs_be16 tci)
>>>      memmove(veh, (char *)veh + VLAN_HEADER_LEN, 2 * ETH_ADDR_LEN);
>>>      veh->veth_type = tpid;
>>>      veh->veth_tci = tci & htons(~VLAN_CFI);
>>> +
>>> +#ifdef DPDK_NETDEV
>>> +    struct rte_mbuf *pkt = &(packet->mbuf);
>>> +    pkt->l2_len += sizeof(struct vlan_hdr);
>>> +#endif
>>>  }
>>>
>>>  /* Removes outermost VLAN header (if any is present) from 'packet'.
>>> @@ -221,6 +230,11 @@ eth_pop_vlan(struct dp_packet *packet)
>>>          memmove((char *)veh + VLAN_HEADER_LEN, veh, 2 * ETH_ADDR_LEN);
>>>          dp_packet_resize_l2(packet, -VLAN_HEADER_LEN);
>>>      }
>>> +
>>> +#ifdef DPDK_NETDEV
>>> +    struct rte_mbuf *pkt = &(packet->mbuf);
>>> +    pkt->l2_len -= sizeof(struct vlan_hdr);
>>> +#endif
>>>  }
>>>
>>>  /* Set ethertype of the packet. */
>>
>>Would it be better to change the dp_packet_resize_l2 call?  Are you
>
> Hey Aaron,
>
> Good call - that would definitely be a more suitable location.
>
>>worried about the mpls case?
>
> I haven't considered mpls here at all, as it wasn't part of the use
> case for which this new version of the patch was produced I'm afraid.
> Out of curiosity, what is your concern here?

The two users of dp_packet_resize_l2 are the vlan and the mpls code.  I
had assumed you skipped for that reason, but maybe that isn't the case.

> Thanks in advance,
> Mark
Mark Kavanagh April 13, 2017, 3:52 p.m. UTC | #4
>
>"Kavanagh, Mark B" <mark.b.kavanagh@intel.com> writes:
>
>>>Hi Mark,
>>>
>>>Mark Kavanagh <mark.b.kavanagh@intel.com> writes:
>>>
>>>> TCP Segmentation Offload (TSO) is a feature which enables
>>>> the TCP/IP network stack to delegate segmentation of a TCP
>>>> segment to the NIC, thus saving compute resources.
>>>>
>>>> This commit adds support for TSO in the DPDK vHost-User backend,
>>>> to OvS v2.6.1; this enables a guest to offload segmentation of
>>>> TCP segments that it sends to OvS.
>>>>
>>>> This patch is not intended for upstreaming, but rather was produced
>>>> in response to requests for an updated version of the initial TSO RFC
>>>> patch posted here:
>>>> https://mail.openvswitch.org/pipermail/ovs-dev/2016-June/316414.html
>>>>
>>>> Signed-off-by: Mark Kavanagh <mark.b.kavanagh@intel.com>
>>>> ---
>>>
>>>...
>>>
>>>> diff --git a/lib/packets.c b/lib/packets.c
>>>> index e4c29d5..2417ba2 100644
>>>> --- a/lib/packets.c
>>>> +++ b/lib/packets.c
>>>> @@ -33,6 +33,10 @@
>>>>  #include "dp-packet.h"
>>>>  #include "unaligned.h"
>>>>
>>>> +#ifdef DPDK_NETDEV
>>>> +#include "rte_ether.h"
>>>> +#endif
>>>> +
>>>>  const struct in6_addr in6addr_exact = IN6ADDR_EXACT_INIT;
>>>>  const struct in6_addr in6addr_all_hosts = IN6ADDR_ALL_HOSTS_INIT;
>>>>
>>>> @@ -204,6 +208,11 @@ eth_push_vlan(struct dp_packet *packet, ovs_be16 tpid, ovs_be16 tci)
>>>>      memmove(veh, (char *)veh + VLAN_HEADER_LEN, 2 * ETH_ADDR_LEN);
>>>>      veh->veth_type = tpid;
>>>>      veh->veth_tci = tci & htons(~VLAN_CFI);
>>>> +
>>>> +#ifdef DPDK_NETDEV
>>>> +    struct rte_mbuf *pkt = &(packet->mbuf);
>>>> +    pkt->l2_len += sizeof(struct vlan_hdr);
>>>> +#endif
>>>>  }
>>>>
>>>>  /* Removes outermost VLAN header (if any is present) from 'packet'.
>>>> @@ -221,6 +230,11 @@ eth_pop_vlan(struct dp_packet *packet)
>>>>          memmove((char *)veh + VLAN_HEADER_LEN, veh, 2 * ETH_ADDR_LEN);
>>>>          dp_packet_resize_l2(packet, -VLAN_HEADER_LEN);
>>>>      }
>>>> +
>>>> +#ifdef DPDK_NETDEV
>>>> +    struct rte_mbuf *pkt = &(packet->mbuf);
>>>> +    pkt->l2_len -= sizeof(struct vlan_hdr);
>>>> +#endif
>>>>  }
>>>>
>>>>  /* Set ethertype of the packet. */
>>>
>>>Would it be better to change the dp_packet_resize_l2 call?  Are you
>>
>> Hey Aaron,
>>
>> Good call - that would definitely be a more suitable location.
>>
>>>worried about the mpls case?
>>
>> I haven't considered mpls here at all, as it wasn't part of the use
>> case for which this new version of the patch was produced I'm afraid.
>> Out of curiosity, what is your concern here?
>
>The two users of dp_packet_resize_l2 are the vlan and the mpls code.  I
>had assumed you skipped for that reason, but maybe that isn't the case.

Yeah, I hadn't considered mpls (intentionally); if dp_packet_resize_l2 is invoked as part of mpls, then the code is better as-is.

Thanks again,
Mark
>
>> Thanks in advance,
>> Mark
diff mbox

Patch

diff --git a/INSTALL.DPDK-ADVANCED.md b/INSTALL.DPDK-ADVANCED.md
index e3603a1..a0737a2 100755
--- a/INSTALL.DPDK-ADVANCED.md
+++ b/INSTALL.DPDK-ADVANCED.md
@@ -14,7 +14,8 @@  OVS DPDK ADVANCED INSTALL GUIDE
 9. [Flow Control](#fc)
 10. [Pdump](#pdump)
 11. [Jumbo Frames](#jumbo)
-12. [Vsperf](#vsperf)
+12. [TCP Segmentation Offload (TSO)](#tso)
+13. [Vsperf](#vsperf)
 
 ## <a name="overview"></a> 1. Overview
 
@@ -856,7 +857,106 @@  vhost ports:
      ifconfig eth1 mtu 9000
      ```
 
-## <a name="vsperf"></a> 12. Vsperf
+## <a name="tso"></a> 12. TCP Segmentation Offload (TSO)
+
+### 12.1 Overview
+
+TCP Segmentation Offload (TSO) enables a network stack to delegate
+segmentation of an oversized TCP segment to the underlying physical NIC.
+Offload of frame segmentation achieves computational savings in the core,
+freeing up CPU cycles for more useful work.
+
+DPDK v16.07 added support for `TSO` in the vHost user backend; as such, a
+guest's virtual network interfaces may avail of `TSO`. In such a setup, the
+aforementioned computational savings are made in the core acting as the VM's
+virtual CPU, typically resulting in improved TCP throughput.
+
+To enable TSO in a guest, the underlying NIC must first support `TSO` -
+consult your controller's datasheet for compatibility. Secondly, the NIC
+must have an associated DPDK Poll Mode Driver (PMD) which supports `TSO`.
+
+### 12.2 Enabling TSO
+
+TSO may be enabled in one of two ways, as follows:
+
+  1. QEMU Command Line Parameter:
+
+      ```
+      sudo $QEMU_DIR/x86_64-softmmu/qemu-system-x86_64 \
+      ...
+      -device virtio-net-pci,mac=00:00:00:00:00:01,netdev=mynet1,\
+      mrg_rxbuf=on,csum=on,gso=on,guest_csum=on,guest_tso4=on,\
+       guest_tso6=on,guest_ecn=on \
+       ...
+      ```
+
+  2. ethtool
+`TSO` is enabled in OvS by the DPDK vHost User backend; when a new guest
+connection is established, `TSO` is advertised to the guest as an available
+feature. Assuming that the guest's OS also supports `TSO`, ethtool can be used
+to enable same:
+
+      ```
+      ethtool -K eth0 sg on     # scatter-gather is a prerequisite for TSO
+      ethtool -K eth0 tso on
+      ethtool -k eth0           # verify that TSO is reported as 'on'
+      ```
+
+      <b>Note:</b> In both methods, `mergeable buffers` are required:
+      ```
+      sudo $QEMU_DIR/x86_64-softmmu/qemu-system-x86_64 \
+      ...
+      mrg_rxbuf=on,\
+       ...
+      ```
+
+### 12.3 Performance Tuning
+
+For optimal performance, `TSO` can be used in conjunction with
+`Jumbo Frames`:
+
+ ```
+ Guest 1                               Guest 2
+ +---------------------------------+   +---------------------------------+
+ |                                 |   |                                 |
+ |  ethtool -K eth0 sg on          |   |  ethtool -K eth1 sg on          |
+ |  ethtool -K eth0 tso on         |   |  ethtool -K eth1 tso on         |
+ |  ifconfig eth0 mtu 9000 (flat)  |   |  ifconfig eth1 mtu 9000 (flat)  |
+ |              OR                 |   |              OR                 |
+ |  ifconfig eth0 mtu 8996 (vlan)  |   |  ifconfig eth1 mtu 8996 (vlan)  |
+ |                                 |   |                                 |
+ |             +------+            |   |            +------+             |
+ |             | eth0 |            |   |            | eth1 |             |
+ +---------------------------------+   +---------------------------------+
+                 ^ ^                                   ^ ^
+                 | |                                   | |
+                 | |                                   | |
+ Host 1          v v                   Host 2          v v
+ +---------------------------------+   +---------------------------------+
+ |           | dpdkvhu0 |          |   |          | dpdkvhu1 |           |
+ |           +----------+          |   |          +----------+           |
+ |                                 |   |                                 |
+ | ovs-vsctl set Interface \       |   | ovs-vsctl set Interface \       |
+ | dpdkvhu0 mtu_request = 9000     |   | dpdkvhu1 mtu_request = 9000     |
+ |                                 |   |                                 |
+ | ovs-vsctl set Interface dpdk0 \ |   | ovs-vsctl set Interface dpdk1 \ |
+ | mtu_request = 9000              |   | mtu_request = 9000              |
+ |                                 |   |                                 |
+ |            +-------+            |   |            +-------+            |
+ |            | dpdk0 |            |   |            | dpdk1 |            |
+ +---------------------------------+   +---------------------------------+
+                 ^ ^                                   ^ ^
+                 | |___________________________________| |
+                 |_______________________________________|
+```
+
+### 12.4 Limitations
+
+The current OvS `TSO` implementation supports flat and VLAN networks only
+(i.e. no support for `TSO` over tunneled connections [VxLAN, GRE, IPinIP, etc.]).
+
+
+## <a name="vsperf"></a> 13. Vsperf
 
 Vsperf project goal is to develop vSwitch test framework that can be used to
 validate the suitability of different vSwitch implementations in a Telco deployment
diff --git a/lib/dp-packet.h b/lib/dp-packet.h
index 7c1e637..3e39ff0 100644
--- a/lib/dp-packet.h
+++ b/lib/dp-packet.h
@@ -409,17 +409,16 @@  dp_packet_size(const struct dp_packet *b)
 static inline void
 dp_packet_set_size(struct dp_packet *b, uint32_t v)
 {
-    /* netdev-dpdk does not currently support segmentation; consequently, for
-     * all intents and purposes, 'data_len' (16 bit) and 'pkt_len' (32 bit) may
-     * be used interchangably.
-     *
-     * On the datapath, it is expected that the size of packets
-     * (and thus 'v') will always be <= UINT16_MAX; this means that there is no
-     * loss of accuracy in assigning 'v' to 'data_len'.
-     */
-    b->mbuf.data_len = (uint16_t)v;  /* Current seg length. */
-    b->mbuf.pkt_len = v;             /* Total length of all segments linked to
-                                      * this segment. */
+    /* Assign current segment length. If the total length exceeds the room in
+     * this segment (buf_len - data_off), cap 'data_len' at that room. */
+    if (v > (b->mbuf.buf_len - b->mbuf.data_off)) {
+        b->mbuf.data_len = (uint16_t) (b->mbuf.buf_len - b->mbuf.data_off);
+    } else {
+        b->mbuf.data_len = (uint16_t) v;
+    }
+
+    /* Total length of all segments linked to this segment. */
+    b->mbuf.pkt_len = v;
 }
 
 static inline uint16_t
diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index 27b4ee2..686907f 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -55,11 +55,13 @@ 
 #include "unixctl.h"
 
 #include "rte_config.h"
+#include "rte_ip.h"
 #include "rte_mbuf.h"
 #include "rte_meter.h"
 #ifdef DPDK_PDUMP
 #include "rte_pdump.h"
 #endif
+#include "rte_tcp.h"
 #include "rte_virtio_net.h"
 
 VLOG_DEFINE_THIS_MODULE(dpdk);
@@ -620,6 +622,8 @@  dpdk_eth_dev_queue_setup(struct netdev_dpdk *dev, int n_rxq, int n_txq)
     int diag = 0;
     int i;
     struct rte_eth_conf conf = port_conf;
+    struct rte_eth_txconf *txconf;
+    struct rte_eth_dev_info dev_info;
 
     if (dev->mtu > ETHER_MTU) {
         conf.rxmode.jumbo_frame = 1;
@@ -645,9 +649,17 @@  dpdk_eth_dev_queue_setup(struct netdev_dpdk *dev, int n_rxq, int n_txq)
             break;
         }
 
+        rte_eth_dev_info_get(dev->port_id, &dev_info);
+        txconf = &dev_info.default_txconf;
+        /*
+         * The default value of txq_flags disables DPDK offload features.
+         * Set it to 0 to enable offloads.
+         */
+        txconf->txq_flags = 0;
+
         for (i = 0; i < n_txq; i++) {
             diag = rte_eth_tx_queue_setup(dev->port_id, i, NIC_PORT_TX_Q_SIZE,
-                                          dev->socket_id, NULL);
+                                          dev->socket_id, txconf);
             if (diag) {
                 VLOG_INFO("Interface %s txq(%d) setup error: %s",
                           dev->up.name, i, rte_strerror(-diag));
@@ -1198,6 +1210,32 @@  netdev_dpdk_rxq_dealloc(struct netdev_rxq *rxq)
     rte_free(rx);
 }
 
+/* Perform prerequisites for TSO:
+ *  - set IP checksum offload flag for IPv4 packets
+ *  - calculate TCP pseudoheader for IPv4 and IPv6 packets
+ */
+static void
+netdev_dpdk_prep_tso_pkt(struct rte_mbuf *pkt)
+{
+    struct ether_hdr *eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
+    void *l3_hdr = (char *) eth_hdr + pkt->l2_len;
+    struct tcp_hdr *tcp_hdr = (struct tcp_hdr *) ((char*) l3_hdr+ pkt->l3_len);
+
+    tcp_hdr->cksum = 0;
+    if (pkt->ol_flags & PKT_TX_IPV4){
+        /* IPv4 packet */
+        struct ipv4_hdr *ipv4_hdr = (struct ipv4_hdr *) l3_hdr;
+        ipv4_hdr->hdr_checksum = 0;
+        pkt->ol_flags |= PKT_TX_IP_CKSUM;
+        tcp_hdr->cksum = (rte_ipv4_phdr_cksum(
+                         (struct ipv4_hdr*) l3_hdr, pkt->ol_flags));
+    } else {
+        /* IPv6 packet */
+        tcp_hdr->cksum = (rte_ipv6_phdr_cksum(
+                         (struct ipv6_hdr*) l3_hdr, pkt->ol_flags));
+    }
+}
+
 static inline void
 netdev_dpdk_eth_tx_burst(struct netdev_dpdk *dev, int qid,
                              struct rte_mbuf **pkts, int cnt)
@@ -1451,11 +1489,18 @@  netdev_dpdk_filter_packet_len(struct netdev_dpdk *dev, struct rte_mbuf **pkts,
 
     for (i = 0; i < pkt_cnt; i++) {
         pkt = pkts[i];
+
+        /* Filter oversized packets, unless they are marked for TSO.
+         * In this case, frames will not actually traverse the NIC, but
+         * rather travel between VMs on the same host.
+         */
         if (OVS_UNLIKELY(pkt->pkt_len > dev->max_packet_len)) {
-            VLOG_WARN_RL(&rl, "%s: Too big size %" PRIu32 " max_packet_len %d",
-                         dev->up.name, pkt->pkt_len, dev->max_packet_len);
-            rte_pktmbuf_free(pkt);
-            continue;
+                if (OVS_LIKELY(!(pkt->ol_flags & PKT_TX_TCP_SEG))) {
+                    VLOG_WARN_RL(&rl, "%s: Too big size %" PRIu32 " max_packet_len %d",
+                            dev->up.name, pkt->pkt_len, dev->max_packet_len);
+                    rte_pktmbuf_free(pkt);
+                    continue;
+                }
         }
 
         if (OVS_UNLIKELY(i != cnt)) {
@@ -1662,9 +1707,14 @@  netdev_dpdk_send__(struct netdev_dpdk *dev, int qid,
 
         for (int i = 0; i < cnt; i++) {
             int size = dp_packet_size(batch->packets[i]);
+            struct rte_mbuf *pkt = &batch->packets[i]->mbuf;
 
             if (OVS_UNLIKELY(size > dev->max_packet_len)) {
-                if (next_tx_idx != i) {
+                /* Permit oversized frames that are marked for TSO */
+                if (OVS_UNLIKELY(pkt->ol_flags & PKT_TX_TCP_SEG)) {
+                    netdev_dpdk_prep_tso_pkt(pkt);
+                } else {
+                    if (next_tx_idx != i) {
                     temp_cnt = i - next_tx_idx;
                     qos_pkts = temp_cnt;
 
@@ -1676,14 +1726,15 @@  netdev_dpdk_send__(struct netdev_dpdk *dev, int qid,
                             (struct rte_mbuf **)&batch->packets[next_tx_idx],
                             temp_cnt);
 
-                }
+                    }
 
-                VLOG_WARN_RL(&rl, "Too big size %d max_packet_len %d",
-                             (int)size , dev->max_packet_len);
+                    VLOG_WARN_RL(&rl, "Too big size %d max_packet_len %d",
+                            (int)size , dev->max_packet_len);
 
-                dp_packet_delete(batch->packets[i]);
-                dropped++;
-                next_tx_idx = i + 1;
+                    dp_packet_delete(batch->packets[i]);
+                    dropped++;
+                    next_tx_idx = i + 1;
+                }
             }
         }
         if (next_tx_idx != cnt) {
@@ -2560,10 +2611,6 @@  static int
 dpdk_vhost_class_init(void)
 {
     rte_vhost_driver_callback_register(&virtio_net_device_ops);
-    rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_HOST_TSO4
-                            | 1ULL << VIRTIO_NET_F_HOST_TSO6
-                            | 1ULL << VIRTIO_NET_F_CSUM);
-
     ovs_thread_create("vhost_thread", start_vhost_loop, NULL);
     return 0;
 }
diff --git a/lib/packets.c b/lib/packets.c
index e4c29d5..2417ba2 100644
--- a/lib/packets.c
+++ b/lib/packets.c
@@ -33,6 +33,10 @@ 
 #include "dp-packet.h"
 #include "unaligned.h"
 
+#ifdef DPDK_NETDEV
+#include "rte_ether.h"
+#endif
+
 const struct in6_addr in6addr_exact = IN6ADDR_EXACT_INIT;
 const struct in6_addr in6addr_all_hosts = IN6ADDR_ALL_HOSTS_INIT;
 
@@ -204,6 +208,11 @@  eth_push_vlan(struct dp_packet *packet, ovs_be16 tpid, ovs_be16 tci)
     memmove(veh, (char *)veh + VLAN_HEADER_LEN, 2 * ETH_ADDR_LEN);
     veh->veth_type = tpid;
     veh->veth_tci = tci & htons(~VLAN_CFI);
+
+#ifdef DPDK_NETDEV
+    struct rte_mbuf *pkt = &(packet->mbuf);
+    pkt->l2_len += sizeof(struct vlan_hdr);
+#endif
 }
 
 /* Removes outermost VLAN header (if any is present) from 'packet'.
@@ -221,6 +230,11 @@  eth_pop_vlan(struct dp_packet *packet)
         memmove((char *)veh + VLAN_HEADER_LEN, veh, 2 * ETH_ADDR_LEN);
         dp_packet_resize_l2(packet, -VLAN_HEADER_LEN);
     }
+
+#ifdef DPDK_NETDEV
+    struct rte_mbuf *pkt = &(packet->mbuf);
+    pkt->l2_len -= sizeof(struct vlan_hdr);
+#endif
 }
 
 /* Set ethertype of the packet. */