diff mbox

[ovs-dev,v2,3/3] datapath-windows: STT - Enable support for TCP Segmentation offloads

Message ID 1445903321-7864-4-git-send-email-vsairam@vmware.com
State Accepted
Headers show

Commit Message

Sairam Venugopal Oct. 26, 2015, 11:48 p.m. UTC
Add support to the STT encap and decap functions for reassembling packet
fragments. Also add support for offloading the packet to NDIS.

Signed-off-by: Sairam Venugopal <vsairam@vmware.com>
---
 datapath-windows/ovsext/Actions.c |  40 ++--
 datapath-windows/ovsext/Stt.c     | 398 +++++++++++++++++++++++++++++---------
 2 files changed, 329 insertions(+), 109 deletions(-)

Comments

Nithin Raju Oct. 27, 2015, 5:23 p.m. UTC | #1
Thanks for working on this. It is really awesome to have TSO support with
STT now!

Acked-by: Nithin Raju <nithin@vmware.com>

-----Original Message-----
From: Sairam Venugopal <vsairam@vmware.com>
Date: Tuesday, October 27, 2015 at 10:20 AM
To: Nithin Raju <nithin@vmware.com>
Subject: Fw: [PATCH v2 3/3] datapath-windows: STT - Enable support for TCP
Segmentation offloads

>
>________________________________________
>From: Sairam Venugopal <vsairam@vmware.com>
>Sent: Monday, October 26, 2015 4:48 PM
>To: dev@openvswitch.org
>Cc: Sairam Venugopal
>Subject: [PATCH v2 3/3] datapath-windows: STT - Enable support for TCP
>Segmentation offloads
>
>Add support to STT - Encap and Decap functions to reassemble the packet
>fragments. Also add support to offload the packet to NDIS.
>
>Signed-off-by: Sairam Venugopal <vsairam@vmware.com>
>---
> datapath-windows/ovsext/Actions.c |  40 ++--
> datapath-windows/ovsext/Stt.c     | 398
>+++++++++++++++++++++++++++++---------
> 2 files changed, 329 insertions(+), 109 deletions(-)
>
>diff --git a/datapath-windows/ovsext/Actions.c
>b/datapath-windows/ovsext/Actions.c
>index b4644a7..ce592b3 100644
>--- a/datapath-windows/ovsext/Actions.c
>+++ b/datapath-windows/ovsext/Actions.c
>@@ -594,7 +594,7 @@ OvsDoFlowLookupOutput(OvsForwardingContext *ovsFwdCtx)
>         InitializeListHead(&missedPackets);
>         status = OvsCreateAndAddPackets(NULL, 0, OVS_PACKET_CMD_MISS,
>vport,
>                           &key,ovsFwdCtx->curNbl,
>-                          ovsFwdCtx->tunnelRxNic != NULL,
>&ovsFwdCtx->layers,
>+                          FALSE, &ovsFwdCtx->layers,
>                           ovsFwdCtx->switchContext, &missedPackets,
>&num);
>         if (num) {
>             OvsQueuePackets(&missedPackets, num);
>@@ -709,6 +709,7 @@ OvsTunnelPortRx(OvsForwardingContext *ovsFwdCtx)
>     NDIS_STATUS status = NDIS_STATUS_SUCCESS;
>     PNET_BUFFER_LIST newNbl = NULL;
>     POVS_VPORT_ENTRY tunnelRxVport = ovsFwdCtx->tunnelRxNic;
>+    PCWSTR dropReason = L"OVS-dropped due to new decap packet";
>
>     if (OvsValidateIPChecksum(ovsFwdCtx->curNbl, &ovsFwdCtx->layers)
>             != NDIS_STATUS_SUCCESS) {
>@@ -730,6 +731,10 @@ OvsTunnelPortRx(OvsForwardingContext *ovsFwdCtx)
>     case OVS_VPORT_TYPE_STT:
>         status = OvsDecapStt(ovsFwdCtx->switchContext, ovsFwdCtx->curNbl,
>                              &ovsFwdCtx->tunKey, &newNbl);
>+        if (status == NDIS_STATUS_SUCCESS && newNbl == NULL) {
>+            /* This was an STT-LSO Fragment */
>+            dropReason = L"OVS-STT segment is cached";
>+        }
>         break;
>     default:
>         OVS_LOG_ERROR("Rx: Unhandled tunnel type: %d\n",
>@@ -747,25 +752,26 @@ OvsTunnelPortRx(OvsForwardingContext *ovsFwdCtx)
>      * tunnelRxNic and other fields will be cleared, re-init the context
>      * before usage.
>       */
>-    OvsCompleteNBLForwardingCtx(ovsFwdCtx,
>-                                L"OVS-dropped due to new decap packet");
>+    OvsCompleteNBLForwardingCtx(ovsFwdCtx, dropReason);
>
>-    /* Decapsulated packet is in a new NBL */
>-    ovsFwdCtx->tunnelRxNic = tunnelRxVport;
>-    OvsInitForwardingCtx(ovsFwdCtx, ovsFwdCtx->switchContext,
>-                         newNbl, tunnelRxVport->portNo, 0,
>-                 
>NET_BUFFER_LIST_SWITCH_FORWARDING_DETAIL(newNbl),
>-                         ovsFwdCtx->completionList,
>-                         &ovsFwdCtx->layers, FALSE);
>+    if (newNbl) {
>+        /* Decapsulated packet is in a new NBL */
>+        ovsFwdCtx->tunnelRxNic = tunnelRxVport;
>+        OvsInitForwardingCtx(ovsFwdCtx, ovsFwdCtx->switchContext,
>+                             newNbl, tunnelRxVport->portNo, 0,
>+                 
>NET_BUFFER_LIST_SWITCH_FORWARDING_DETAIL(newNbl),
>+                             ovsFwdCtx->completionList,
>+                             &ovsFwdCtx->layers, FALSE);
>
>-    /*
>-     * Set the NBL's SourcePortId and SourceNicIndex to default values to
>-     * keep NDIS happy when we forward the packet.
>-     */
>-    ovsFwdCtx->fwdDetail->SourcePortId = NDIS_SWITCH_DEFAULT_PORT_ID;
>-    ovsFwdCtx->fwdDetail->SourceNicIndex = 0;
>+        /*
>+         * Set the NBL's SourcePortId and SourceNicIndex to default
>values to
>+         * keep NDIS happy when we forward the packet.
>+         */
>+        ovsFwdCtx->fwdDetail->SourcePortId = NDIS_SWITCH_DEFAULT_PORT_ID;
>+        ovsFwdCtx->fwdDetail->SourceNicIndex = 0;
>
>-    status = OvsDoFlowLookupOutput(ovsFwdCtx);
>+        status = OvsDoFlowLookupOutput(ovsFwdCtx);
>+    }
>     ASSERT(ovsFwdCtx->curNbl == NULL);
>     OvsClearTunRxCtx(ovsFwdCtx);
>
>diff --git a/datapath-windows/ovsext/Stt.c b/datapath-windows/ovsext/Stt.c
>index b78ef95..ef44d23 100644
>--- a/datapath-windows/ovsext/Stt.c
>+++ b/datapath-windows/ovsext/Stt.c
>@@ -34,6 +34,7 @@
> #endif
> #define OVS_DBG_MOD OVS_DBG_STT
> #include "Debug.h"
>+#include "Jhash.h"
>
> KSTART_ROUTINE OvsSttDefragCleaner;
> static PLIST_ENTRY OvsSttPktFragHash;
>@@ -152,8 +153,8 @@ OvsDoEncapStt(POVS_VPORT_ENTRY vport,
>     UINT32 headRoom = OvsGetSttTunHdrSize();
>     UINT32 tcpChksumLen;
>     PUINT8 bufferStart;
>-
>-    UNREFERENCED_PARAMETER(layers);
>+    ULONG mss = 0;
>+    NDIS_TCP_LARGE_SEND_OFFLOAD_NET_BUFFER_LIST_INFO lsoInfo;
>
>     curNb = NET_BUFFER_LIST_FIRST_NB(curNbl);
>
>@@ -162,14 +163,20 @@ OvsDoEncapStt(POVS_VPORT_ENTRY vport,
>     BOOLEAN innerPartialChecksum = FALSE;
>
>     if (layers->isTcp) {
>-        NDIS_TCP_LARGE_SEND_OFFLOAD_NET_BUFFER_LIST_INFO lsoInfo;
>-
>         lsoInfo.Value = NET_BUFFER_LIST_INFO(curNbl,
>                 TcpLargeSendNetBufferListInfo);
>-        if (lsoInfo.LsoV1Transmit.MSS) {
>-            /* XXX We don't handle LSO yet */
>-            OVS_LOG_ERROR("LSO on STT is not supported");
>-            return NDIS_STATUS_FAILURE;
>+
>+        switch (lsoInfo.Transmit.Type) {
>+            case NDIS_TCP_LARGE_SEND_OFFLOAD_V1_TYPE:
>+                mss = lsoInfo.LsoV1Transmit.MSS;
>+                break;
>+            case NDIS_TCP_LARGE_SEND_OFFLOAD_V2_TYPE:
>+                mss = lsoInfo.LsoV2Transmit.MSS;
>+                break;
>+            default:
>+                OVS_LOG_ERROR("Unknown LSO transmit type:%d",
>+                              lsoInfo.Transmit.Type);
>+                return NDIS_STATUS_FAILURE;
>         }
>     }
>
>@@ -186,21 +193,36 @@ OvsDoEncapStt(POVS_VPORT_ENTRY vport,
>         return NDIS_STATUS_FAILURE;
>     }
>
>-    curNb = NET_BUFFER_LIST_FIRST_NB(*newNbl);
>+    curNbl = *newNbl;
>+    curNb = NET_BUFFER_LIST_FIRST_NB(curNbl);
>     curMdl = NET_BUFFER_CURRENT_MDL(curNb);
>+    /* NB Chain should be split before */
>+    ASSERT(NET_BUFFER_NEXT_NB(curNb) == NULL);
>+    innerFrameLen = NET_BUFFER_DATA_LENGTH(curNb);
>+
>     bufferStart = (PUINT8)MmGetSystemAddressForMdlSafe(curMdl,
>                                                        LowPagePriority);
>     bufferStart += NET_BUFFER_CURRENT_MDL_OFFSET(curNb);
>
>-    if (layers->isIPv4 && csumInfo.Transmit.IpHeaderChecksum) {
>+    if (layers->isIPv4) {
>         IPHdr *ip = (IPHdr *)(bufferStart + layers->l3Offset);
>-        ip->check = IPChecksum((UINT8 *)ip, ip->ihl * 4, 0);
>+        if (!ip->tot_len) {
>+            ip->tot_len = htons(innerFrameLen - sizeof(EthHdr));
>+        }
>+        if (!ip->check) {
>+            ip->check = IPChecksum((UINT8 *)ip, ip->ihl * 4, 0);
>+        }
>     }
>+
>     if (layers->isTcp) {
>-        if(!csumInfo.Transmit.TcpChecksum) {
>-            innerChecksumVerified = TRUE;
>-        } else {
>+        if (mss) {
>             innerPartialChecksum = TRUE;
>+        } else {
>+            if (!csumInfo.Transmit.TcpChecksum) {
>+                innerChecksumVerified = TRUE;
>+            } else {
>+                innerPartialChecksum = TRUE;
>+            }
>         }
>     } else if (layers->isUdp) {
>         if(!csumInfo.Transmit.UdpChecksum) {
>@@ -210,24 +232,6 @@ OvsDoEncapStt(POVS_VPORT_ENTRY vport,
>         }
>     }
>
>-    curNbl = *newNbl;
>-    curNb = NET_BUFFER_LIST_FIRST_NB(curNbl);
>-    /* NB Chain should be split before */
>-    ASSERT(NET_BUFFER_NEXT_NB(curNb) == NULL);
>-
>-    innerFrameLen = NET_BUFFER_DATA_LENGTH(curNb);
>-    /*
>-     * External port can't be removed as we hold the dispatch lock
>-     * We also check if the external port was removed beforecalling
>-     * port encapsulation functions
>-     */
>-    if (innerFrameLen > OvsGetExternalMtu(switchContext) - headRoom) {
>-        OVS_LOG_ERROR("Packet too large (size %d, mtu %d). Can't
>encapsulate",
>-                innerFrameLen, OvsGetExternalMtu(switchContext));
>-        status = NDIS_STATUS_FAILURE;
>-        goto ret_error;
>-    }
>-
>     status = NdisRetreatNetBufferDataStart(curNb, headRoom, 0, NULL);
>     if (status != NDIS_STATUS_SUCCESS) {
>         ASSERT(!"Unable to NdisRetreatNetBufferDataStart(headroom)");
>@@ -301,33 +305,52 @@ OvsDoEncapStt(POVS_VPORT_ENTRY vport,
>                                           IPPROTO_TCP, (uint16)
>tcpChksumLen);
>     sttHdr->version = 0;
>
>-    /* XXX need to peek into the inner packet, hard code for now */
>-    sttHdr->flags = STT_PROTO_IPV4;
>-    if (innerChecksumVerified) {
>-        sttHdr->flags |= STT_CSUM_VERIFIED;
>-    } else if (innerPartialChecksum) {
>+    /* Set STT Header */
>+    sttHdr->flags = 0;
>+    if (innerPartialChecksum) {
>         sttHdr->flags |= STT_CSUM_PARTIAL;
>+        if (layers->isIPv4) {
>+            sttHdr->flags |= STT_PROTO_IPV4;
>+        }
>+        if (layers->isTcp) {
>+            sttHdr->flags |= STT_PROTO_TCP;
>+        }
>+        sttHdr->l4Offset = (UINT8) layers->l4Offset;
>+        sttHdr->mss = (UINT16) htons(mss);
>+    } else if (innerChecksumVerified) {
>+        sttHdr->flags = STT_CSUM_VERIFIED;
>+        sttHdr->l4Offset = 0;
>+        sttHdr->mss = 0;
>     }
>-    sttHdr->l4Offset = 0;
>
>     sttHdr->reserved = 0;
>-    /* XXX Used for large TCP packets.Not sure how it is used, clarify */
>-    sttHdr->mss = 0;
>     sttHdr->vlanTCI = 0;
>     sttHdr->key = tunKey->tunnelId;
>     /* Zero out stt padding */
>     *(uint16 *)(sttHdr + 1) = 0;
>
>     /* Offload IP and TCP checksum */
>+    ULONG tcpHeaderOffset = sizeof *outerEthHdr +
>+                        outerIpHdr->ihl * 4;
>     csumInfo.Value = 0;
>     csumInfo.Transmit.IpHeaderChecksum = 1;
>     csumInfo.Transmit.TcpChecksum = 1;
>     csumInfo.Transmit.IsIPv4 = 1;
>-    csumInfo.Transmit.TcpHeaderOffset = sizeof *outerEthHdr +
>-                                        outerIpHdr->ihl * 4;
>+    csumInfo.Transmit.TcpHeaderOffset = tcpHeaderOffset;
>     NET_BUFFER_LIST_INFO(curNbl,
>                          TcpIpChecksumNetBufferListInfo) =
>csumInfo.Value;
>
>+    UINT32 encapMss = OvsGetExternalMtu(switchContext) - sizeof(IPHdr) -
>sizeof(TCPHdr);
>+    if (ipTotalLen > encapMss) {
>+        lsoInfo.Value = 0;
>+        lsoInfo.LsoV2Transmit.TcpHeaderOffset = tcpHeaderOffset;
>+        lsoInfo.LsoV2Transmit.MSS = encapMss;
>+        lsoInfo.LsoV2Transmit.Type = NDIS_TCP_LARGE_SEND_OFFLOAD_V2_TYPE;
>+        lsoInfo.LsoV2Transmit.IPVersion =
>NDIS_TCP_LARGE_SEND_OFFLOAD_IPv4;
>+        NET_BUFFER_LIST_INFO(curNbl,
>+                             TcpLargeSendNetBufferListInfo) =
>lsoInfo.Value;
>+    }
>+
>     return STATUS_SUCCESS;
>
> ret_error:
>@@ -338,16 +361,22 @@ ret_error:
>
> /*
>  
>*-------------------------------------------------------------------------
>---
>- * OvsCalculateTCPChecksum
>- *     Calculate TCP checksum
>+ * OvsValidateTCPChecksum
>+ *     Validate TCP checksum
>  
>*-------------------------------------------------------------------------
>---
>  */
> static __inline NDIS_STATUS
>-OvsCalculateTCPChecksum(PNET_BUFFER_LIST curNbl, PNET_BUFFER curNb)
>+OvsValidateTCPChecksum(PNET_BUFFER_LIST curNbl, PNET_BUFFER curNb)
> {
>     NDIS_TCP_IP_CHECKSUM_NET_BUFFER_LIST_INFO csumInfo;
>     csumInfo.Value = NET_BUFFER_LIST_INFO(curNbl,
>                  
>TcpIpChecksumNetBufferListInfo);
>+
>+    /* Check if NIC has indicated TCP checksum failure */
>+    if (csumInfo.Receive.TcpChecksumFailed) {
>+        return NDIS_STATUS_INVALID_PACKET;
>+    }
>+
>     UINT16 checkSum;
>
>     /* Check if TCP Checksum has been calculated by NIC */
>@@ -399,10 +428,9 @@ OvsInitSttDefragmentation()
>     NdisAllocateSpinLock(&OvsSttSpinLock);
>
>     /* Init the Hash Buffer */
>-    OvsSttPktFragHash = (PLIST_ENTRY) OvsAllocateMemoryWithTag(
>-                                                sizeof(LIST_ENTRY)
>-                                                * STT_HASH_TABLE_SIZE,
>-                                                OVS_STT_POOL_TAG);
>+    OvsSttPktFragHash = OvsAllocateMemoryWithTag(sizeof(LIST_ENTRY)
>+                                                 * STT_HASH_TABLE_SIZE,
>+                                                 OVS_STT_POOL_TAG);
>     if (OvsSttPktFragHash == NULL) {
>         NdisFreeSpinLock(&OvsSttSpinLock);
>         return STATUS_INSUFFICIENT_RESOURCES;
>@@ -487,6 +515,7 @@ OvsSttDefragCleaner(PVOID data)
>                 entry = CONTAINING_RECORD(link, OVS_STT_PKT_ENTRY, link);
>                 if (entry->timeout < currentTime) {
>                     RemoveEntryList(&entry->link);
>+                    OvsFreeMemoryWithTag(entry->packetBuf,
>OVS_STT_POOL_TAG);
>                     OvsFreeMemoryWithTag(entry, OVS_STT_POOL_TAG);
>                 }
>             }
>@@ -500,6 +529,158 @@ OvsSttDefragCleaner(PVOID data)
>     PsTerminateSystemThread(STATUS_SUCCESS);
> }
>
>+static OVS_STT_PKT_KEY
>+OvsGeneratePacketKey(IPHdr *ipHdr, TCPHdr *tcpHdr)
>+{
>+    OVS_STT_PKT_KEY key;
>+    key.sAddr = ipHdr->saddr;
>+    key.dAddr = ipHdr->daddr;
>+    key.ackSeq = ntohl(tcpHdr->ack_seq);
>+    return key;
>+}
>+
>+static UINT32
>+OvsSttGetPktHash(OVS_STT_PKT_KEY *pktKey)
>+{
>+    UINT32 arr[3];
>+    arr[0] = pktKey->ackSeq;
>+    arr[1] = pktKey->dAddr;
>+    arr[2] = pktKey->sAddr;
>+    return OvsJhashWords(arr, 3, OVS_HASH_BASIS);
>+}
>+
>+static VOID *
>+OvsLookupPktFrag(OVS_STT_PKT_KEY *pktKey, UINT32 hash)
>+{
>+    PLIST_ENTRY link;
>+    POVS_STT_PKT_ENTRY entry;
>+
>+    LIST_FORALL(&OvsSttPktFragHash[hash & STT_HASH_TABLE_MASK], link) {
>+        entry = CONTAINING_RECORD(link, OVS_STT_PKT_ENTRY, link);
>+        if (entry->ovsPktKey.ackSeq == pktKey->ackSeq &&
>+            entry->ovsPktKey.dAddr == pktKey->dAddr &&
>+            entry->ovsPktKey.sAddr == pktKey->sAddr) {
>+            return entry;
>+        }
>+    }
>+    return NULL;
>+}
>+
>+/*
>+*
>+-------------------------------------------------------------------------
>-
>+* OvsSttReassemble --
>+*     Reassemble an LSO packet from multiple STT-Fragments.
>+*
>+-------------------------------------------------------------------------
>-
>+*/
>+PNET_BUFFER_LIST
>+OvsSttReassemble(POVS_SWITCH_CONTEXT switchContext,
>+                 PNET_BUFFER_LIST curNbl,
>+                 IPHdr *ipHdr,
>+                 TCPHdr *tcp,
>+                 SttHdr *newSttHdr,
>+                 UINT16 payloadLen)
>+{
>+    UINT32 seq = ntohl(tcp->seq);
>+    UINT32 innerPacketLen = (seq >> STT_SEQ_LEN_SHIFT) - STT_HDR_LEN;
>+    UINT32 segOffset = STT_SEGMENT_OFF(seq);
>+    UINT32 offset = segOffset == 0 ? 0 : segOffset - STT_HDR_LEN;
>+    UINT32 startOffset = 0;
>+    OVS_STT_PKT_ENTRY *pktFragEntry;
>+    PNET_BUFFER_LIST targetPNbl = NULL;
>+    BOOLEAN lastPacket = FALSE;
>+    PNET_BUFFER sourceNb;
>+    UINT32 fragmentLength = payloadLen;
>+    SttHdr stt;
>+    SttHdr *sttHdr = NULL;
>+    sourceNb = NET_BUFFER_LIST_FIRST_NB(curNbl);
>+
>+    /* XXX optimize this lock */
>+    NdisAcquireSpinLock(&OvsSttSpinLock);
>+
>+    /* If this is the first fragment, copy the STT header */
>+    if (segOffset == 0) {
>+        sttHdr = NdisGetDataBuffer(sourceNb, sizeof(SttHdr), &stt, 1, 0);
>+        if (sttHdr == NULL) {
>+            OVS_LOG_ERROR("Unable to retrieve STT header");
>+            return NULL;
>+        }
>+        fragmentLength = fragmentLength - STT_HDR_LEN;
>+        startOffset = startOffset + STT_HDR_LEN;
>+    }
>+
>+    /* Lookup fragment */
>+    OVS_STT_PKT_KEY pktKey = OvsGeneratePacketKey(ipHdr, tcp);
>+    UINT32 hash = OvsSttGetPktHash(&pktKey);
>+    pktFragEntry = OvsLookupPktFrag(&pktKey, hash);
>+
>+    if (pktFragEntry == NULL) {
>+        /* Create a new Packet Entry */
>+        POVS_STT_PKT_ENTRY entry;
>+        entry = OvsAllocateMemoryWithTag(sizeof(OVS_STT_PKT_ENTRY),
>+                                         OVS_STT_POOL_TAG);
>+        RtlZeroMemory(entry, sizeof (OVS_STT_PKT_ENTRY));
>+
>+        /* Update Key, timestamp and recvdLen */
>+        NdisMoveMemory(&entry->ovsPktKey, &pktKey, sizeof
>(OVS_STT_PKT_KEY));
>+
>+        entry->recvdLen = fragmentLength;
>+
>+        UINT64 currentTime;
>+        NdisGetCurrentSystemTime((LARGE_INTEGER *) &currentTime);
>+        entry->timeout = currentTime + STT_ENTRY_TIMEOUT;
>+
>+        if (segOffset == 0) {
>+            entry->sttHdr = *sttHdr;
>+        }
>+
>+        /* Copy the data from Source to new buffer */
>+        entry->packetBuf = OvsAllocateMemoryWithTag(innerPacketLen,
>+                                                    OVS_STT_POOL_TAG);
>+        if (OvsGetPacketBytes(curNbl, fragmentLength, startOffset,
>+                              entry->packetBuf + offset) == NULL) {
>+            OVS_LOG_ERROR("Error when obtaining bytes from Packet");
>+            goto handle_error;
>+        }
>+
>+        /* Insert the entry in the Static Buffer */
>+        InsertHeadList(&OvsSttPktFragHash[hash & STT_HASH_TABLE_MASK],
>+                       &entry->link);
>+    } else {
>+        /* Add to received length to identify if this is the last
>fragment */
>+        pktFragEntry->recvdLen += fragmentLength;
>+        lastPacket = (pktFragEntry->recvdLen == innerPacketLen);
>+
>+        if (segOffset == 0) {
>+            pktFragEntry->sttHdr = *sttHdr;
>+        }
>+
>+        /* Copy the fragment data from Source to existing buffer */
>+        if (OvsGetPacketBytes(curNbl, fragmentLength, startOffset,
>+                              pktFragEntry->packetBuf + offset) == NULL)
>{
>+            OVS_LOG_ERROR("Error when obtaining bytes from Packet");
>+            goto handle_error;
>+        }
>+    }
>+
>+handle_error:
>+    if (lastPacket) {
>+        /* Retrieve the original STT header */
>+        NdisMoveMemory(newSttHdr, &pktFragEntry->sttHdr, sizeof
>(SttHdr));
>+        targetPNbl = OvsAllocateNBLFromBuffer(switchContext,
>pktFragEntry->packetBuf,
>+                                              innerPacketLen);
>+
>+        /* Delete this entry and free up the memory. */
>+        RemoveEntryList(&pktFragEntry->link);
>+        OvsFreeMemoryWithTag(pktFragEntry->packetBuf, OVS_STT_POOL_TAG);
>+        OvsFreeMemoryWithTag(pktFragEntry, OVS_STT_POOL_TAG);
>+    }
>+
>+    NdisReleaseSpinLock(&OvsSttSpinLock);
>+    return lastPacket ? targetPNbl : NULL;
>+}
>+
> /*
>  * 
>--------------------------------------------------------------------------
>  * OvsDecapStt --
>@@ -513,34 +694,20 @@ OvsDecapStt(POVS_SWITCH_CONTEXT switchContext,
>             PNET_BUFFER_LIST *newNbl)
> {
>     NDIS_STATUS status = NDIS_STATUS_FAILURE;
>-    PNET_BUFFER curNb;
>+    PNET_BUFFER curNb, newNb;
>     IPHdr *ipHdr;
>     char *ipBuf[sizeof(IPHdr)];
>+    SttHdr stt;
>     SttHdr *sttHdr;
>     char *sttBuf[STT_HDR_LEN];
>     UINT32 advanceCnt, hdrLen;
>-    NDIS_TCP_IP_CHECKSUM_NET_BUFFER_LIST_INFO csumInfo;
>+    BOOLEAN isLsoPacket = FALSE;
>
>     curNb = NET_BUFFER_LIST_FIRST_NB(curNbl);
>     ASSERT(NET_BUFFER_NEXT_NB(curNb) == NULL);
>
>-    if (NET_BUFFER_DATA_LENGTH(curNb) < OvsGetSttTunHdrSize()) {
>-        OVS_LOG_ERROR("Packet length received is less than the tunnel
>header:"
>-            " %d<%d\n", NET_BUFFER_DATA_LENGTH(curNb),
>OvsGetSttTunHdrSize());
>-        return NDIS_STATUS_INVALID_LENGTH;
>-    }
>-
>-    /* Verify outer TCP Checksum */
>-    csumInfo.Value = NET_BUFFER_LIST_INFO(curNbl,
>-                 
>TcpIpChecksumNetBufferListInfo);
>-
>-    /* Check if NIC has indicated TCP checksum failure */
>-    if (csumInfo.Receive.TcpChecksumFailed) {
>-        return NDIS_STATUS_INVALID_PACKET;
>-    }
>-
>-    /* Calculate the TCP Checksum */
>-    status = OvsCalculateTCPChecksum(curNbl, curNb);
>+    /* Validate the TCP Checksum */
>+    status = OvsValidateTCPChecksum(curNbl, curNb);
>     if (status != NDIS_STATUS_SUCCESS) {
>         return status;
>     }
>@@ -554,34 +721,73 @@ OvsDecapStt(POVS_SWITCH_CONTEXT switchContext,
>                                                     1 /*no align*/, 0);
>     ASSERT(ipHdr);
>
>+    TCPHdr *tcp = (TCPHdr *)((PCHAR)ipHdr + ipHdr->ihl * 4);
>+
>     /* Skip IP & TCP headers */
>     hdrLen = sizeof(IPHdr) + sizeof(TCPHdr),
>     NdisAdvanceNetBufferDataStart(curNb, hdrLen, FALSE, NULL);
>     advanceCnt += hdrLen;
>
>-    /* STT Header */
>-    sttHdr = NdisGetDataBuffer(curNb, sizeof *sttHdr, (PVOID) &sttBuf,
>-                                                    1 /*no align*/, 0);
>+    UINT32 seq = ntohl(tcp->seq);
>+    UINT32 totalLen = (seq >> STT_SEQ_LEN_SHIFT);
>+    UINT16 payloadLen = (UINT16)ntohs(ipHdr->tot_len)
>+                        - (ipHdr->ihl * 4)
>+                        - (sizeof * tcp);
>+
>+    /* Check if incoming packet requires reassembly */
>+    if (totalLen != payloadLen) {
>+        sttHdr = &stt;
>+        PNET_BUFFER_LIST pNbl = OvsSttReassemble(switchContext, curNbl,
>+                                                 ipHdr, tcp, sttHdr,
>+                                                 payloadLen);
>+        if (pNbl == NULL) {
>+            return NDIS_STATUS_SUCCESS;
>+        }
>+
>+        *newNbl = pNbl;
>+        isLsoPacket = TRUE;
>+    } else {
>+        /* STT Header */
>+        sttHdr = NdisGetDataBuffer(curNb, sizeof *sttHdr,
>+                                   (PVOID) &sttBuf, 1 /*no align*/, 0);
>+        /* Skip stt header, DataOffset points to inner pkt now. */
>+        hdrLen = STT_HDR_LEN;
>+        NdisAdvanceNetBufferDataStart(curNb, hdrLen, FALSE, NULL);
>+        advanceCnt += hdrLen;
>+
>+        *newNbl = OvsPartialCopyNBL(switchContext, curNbl, 0,
>+                                    0, FALSE /*copy NBL info*/);
>+    }
>+
>+    if (*newNbl == NULL) {
>+        OVS_LOG_ERROR("Unable to allocate a new cloned NBL");
>+        return NDIS_STATUS_RESOURCES;
>+    }
>+
>+    status = NdisRetreatNetBufferDataStart(curNb, advanceCnt, 0, NULL);
>+    if (status != NDIS_STATUS_SUCCESS) {
>+        OvsCompleteNBL(switchContext, *newNbl, TRUE);
>+        return NDIS_STATUS_FAILURE;
>+    }
>+    newNb = NET_BUFFER_LIST_FIRST_NB(*newNbl);
>+
>     ASSERT(sttHdr);
>
>     /* Initialize the tunnel key */
>     tunKey->dst = ipHdr->daddr;
>     tunKey->src = ipHdr->saddr;
>     tunKey->tunnelId = sttHdr->key;
>-    tunKey->flags = (OVS_TNL_F_CSUM | OVS_TNL_F_KEY);
>+    tunKey->flags = OVS_TNL_F_KEY;
>     tunKey->tos = ipHdr->tos;
>     tunKey->ttl = ipHdr->ttl;
>     tunKey->pad = 0;
>
>-    /* Skip stt header, DataOffset points to inner pkt now. */
>-    hdrLen = STT_HDR_LEN;
>-    NdisAdvanceNetBufferDataStart(curNb, hdrLen, FALSE, NULL);
>-    advanceCnt += hdrLen;
>+    BOOLEAN requiresLSO = sttHdr->mss != 0;
>
>     /* Verify checksum for inner packet if it's required */
>     if (!(sttHdr->flags & STT_CSUM_VERIFIED)) {
>         BOOLEAN innerChecksumPartial = sttHdr->flags & STT_CSUM_PARTIAL;
>-        EthHdr *eth = (EthHdr *)NdisGetDataBuffer(curNb, sizeof(EthHdr),
>+        EthHdr *eth = (EthHdr *)NdisGetDataBuffer(newNb, sizeof(EthHdr),
>                                                   NULL, 1, 0);
>
>         /* XXX Figure out a way to offload checksum receives */
>@@ -597,14 +803,16 @@ OvsDecapStt(POVS_SWITCH_CONTEXT switchContext,
>                                                   IPPROTO_TCP,
>                                                   (UINT16)l4Payload);
>                 }
>-                tcp->check = CalculateChecksumNB(curNb, l4Payload,
>offset);
>+                if (!requiresLSO) {
>+                    tcp->check = CalculateChecksumNB(newNb, l4Payload,
>offset);
>+                }
>             } else if (ip->protocol == IPPROTO_UDP) {
>                 UDPHdr *udp = (UDPHdr *)((PCHAR)ip + sizeof *ip);
>                 if (!innerChecksumPartial){
>                     udp->check = IPPseudoChecksum(&ip->saddr, &ip->daddr,
>                                                   IPPROTO_UDP,
>l4Payload);
>                 }
>-                udp->check = CalculateChecksumNB(curNb, l4Payload,
>offset);
>+                udp->check = CalculateChecksumNB(newNb, l4Payload,
>offset);
>             }
>         } else if (eth->Type == ntohs(NDIS_ETH_TYPE_IPV6)) {
>             IPv6Hdr *ip = (IPv6Hdr *)((PCHAR)eth + sizeof *eth);
>@@ -617,7 +825,9 @@ OvsDecapStt(POVS_SWITCH_CONTEXT switchContext,
>                                                     (UINT32 *)&ip->daddr,
>                                                     IPPROTO_TCP,
>totalLength);
>                 }
>-                tcp->check = CalculateChecksumNB(curNb, totalLength,
>offset);
>+                if (!requiresLSO) {
>+                    tcp->check = CalculateChecksumNB(newNb, totalLength,
>offset);
>+                }
>             }
>             else if (ip->nexthdr == IPPROTO_UDP) {
>                 UDPHdr *udp = (UDPHdr *)((PCHAR)ip + sizeof *ip);
>@@ -626,23 +836,27 @@ OvsDecapStt(POVS_SWITCH_CONTEXT switchContext,
>                                                     (UINT32 *)&ip->daddr,
>                                                     IPPROTO_UDP,
>totalLength);
>                 }
>-                udp->check = CalculateChecksumNB(curNb, totalLength,
>offset);
>+                udp->check = CalculateChecksumNB(newNb, totalLength,
>offset);
>             }
>         }
>
>-        NET_BUFFER_LIST_INFO(curNbl, TcpIpChecksumNetBufferListInfo) = 0;
>+        NET_BUFFER_LIST_INFO(*newNbl, TcpIpChecksumNetBufferListInfo) =
>0;
>     }
>
>-    *newNbl = OvsPartialCopyNBL(switchContext, curNbl,
>OVS_DEFAULT_COPY_SIZE,
>-                                0, FALSE /*copy NBL info*/);
>-
>-    ASSERT(advanceCnt == OvsGetSttTunHdrSize());
>-    status = NdisRetreatNetBufferDataStart(curNb, advanceCnt, 0, NULL);
>-
>-    if (*newNbl == NULL) {
>-        OVS_LOG_ERROR("OvsDecapStt: Unable to allocate a new cloned
>NBL");
>-        status = NDIS_STATUS_RESOURCES;
>+    if (requiresLSO) {
>+        NDIS_TCP_LARGE_SEND_OFFLOAD_NET_BUFFER_LIST_INFO lsoInfo;
>+        lsoInfo.Value = 0;
>+        lsoInfo.LsoV2Transmit.TcpHeaderOffset = sttHdr->l4Offset;
>+        lsoInfo.LsoV2Transmit.MSS = ETH_DEFAULT_MTU - sizeof(IPHdr) -
>sizeof(TCPHdr);
>+        lsoInfo.LsoV2Transmit.Type = NDIS_TCP_LARGE_SEND_OFFLOAD_V2_TYPE;
>+        if (sttHdr->flags & STT_PROTO_IPV4) {
>+            lsoInfo.LsoV2Transmit.IPVersion =
>NDIS_TCP_LARGE_SEND_OFFLOAD_IPv4;
>+        } else {
>+            lsoInfo.LsoV2Transmit.IPVersion =
>NDIS_TCP_LARGE_SEND_OFFLOAD_IPv6;
>+        }
>+        NET_BUFFER_LIST_INFO(*newNbl,
>+                                TcpLargeSendNetBufferListInfo) =
>lsoInfo.Value;
>     }
>
>-    return status;
>+    return NDIS_STATUS_SUCCESS;
> }
>--
>1.9.5.msysgit.0
>
Gurucharan Shetty Oct. 27, 2015, 8:49 p.m. UTC | #2
> Acked-by: Nithin Raju <nithin@vmware.com>

Series applied.

>
> -----Original Message-----
> From: Sairam Venugopal <vsairam@vmware.com>
> Date: Tuesday, October 27, 2015 at 10:20 AM
> To: Nithin Raju <nithin@vmware.com>
> Subject: Fw: [PATCH v2 3/3] datapath-windows: STT - Enable support for TCP
> Segmentation offloads
>
>>
>>________________________________________
>>From: Sairam Venugopal <vsairam@vmware.com>
>>Sent: Monday, October 26, 2015 4:48 PM
>>To: dev@openvswitch.org
>>Cc: Sairam Venugopal
>>Subject: [PATCH v2 3/3] datapath-windows: STT - Enable support for TCP
>>Segmentation offloads
>>
>>Add support to STT - Encap and Decap functions to reassemble the packet
>>fragments. Also add support to offload the packet to NDIS.
>>
>>Signed-off-by: Sairam Venugopal <vsairam@vmware.com>
>>---
>> datapath-windows/ovsext/Actions.c |  40 ++--
>> datapath-windows/ovsext/Stt.c     | 398
>>+++++++++++++++++++++++++++++---------
>> 2 files changed, 329 insertions(+), 109 deletions(-)
>>
>>diff --git a/datapath-windows/ovsext/Actions.c
>>b/datapath-windows/ovsext/Actions.c
>>index b4644a7..ce592b3 100644
>>--- a/datapath-windows/ovsext/Actions.c
>>+++ b/datapath-windows/ovsext/Actions.c
>>@@ -594,7 +594,7 @@ OvsDoFlowLookupOutput(OvsForwardingContext *ovsFwdCtx)
>>         InitializeListHead(&missedPackets);
>>         status = OvsCreateAndAddPackets(NULL, 0, OVS_PACKET_CMD_MISS,
>>vport,
>>                           &key,ovsFwdCtx->curNbl,
>>-                          ovsFwdCtx->tunnelRxNic != NULL,
>>&ovsFwdCtx->layers,
>>+                          FALSE, &ovsFwdCtx->layers,
>>                           ovsFwdCtx->switchContext, &missedPackets,
>>&num);
>>         if (num) {
>>             OvsQueuePackets(&missedPackets, num);
>>@@ -709,6 +709,7 @@ OvsTunnelPortRx(OvsForwardingContext *ovsFwdCtx)
>>     NDIS_STATUS status = NDIS_STATUS_SUCCESS;
>>     PNET_BUFFER_LIST newNbl = NULL;
>>     POVS_VPORT_ENTRY tunnelRxVport = ovsFwdCtx->tunnelRxNic;
>>+    PCWSTR dropReason = L"OVS-dropped due to new decap packet";
>>
>>     if (OvsValidateIPChecksum(ovsFwdCtx->curNbl, &ovsFwdCtx->layers)
>>             != NDIS_STATUS_SUCCESS) {
>>@@ -730,6 +731,10 @@ OvsTunnelPortRx(OvsForwardingContext *ovsFwdCtx)
>>     case OVS_VPORT_TYPE_STT:
>>         status = OvsDecapStt(ovsFwdCtx->switchContext, ovsFwdCtx->curNbl,
>>                              &ovsFwdCtx->tunKey, &newNbl);
>>+        if (status == NDIS_STATUS_SUCCESS && newNbl == NULL) {
>>+            /* This was an STT-LSO Fragment */
>>+            dropReason = L"OVS-STT segment is cached";
>>+        }
>>         break;
>>     default:
>>         OVS_LOG_ERROR("Rx: Unhandled tunnel type: %d\n",
>>@@ -747,25 +752,26 @@ OvsTunnelPortRx(OvsForwardingContext *ovsFwdCtx)
>>      * tunnelRxNic and other fields will be cleared, re-init the context
>>      * before usage.
>>       */
>>-    OvsCompleteNBLForwardingCtx(ovsFwdCtx,
>>-                                L"OVS-dropped due to new decap packet");
>>+    OvsCompleteNBLForwardingCtx(ovsFwdCtx, dropReason);
>>
>>-    /* Decapsulated packet is in a new NBL */
>>-    ovsFwdCtx->tunnelRxNic = tunnelRxVport;
>>-    OvsInitForwardingCtx(ovsFwdCtx, ovsFwdCtx->switchContext,
>>-                         newNbl, tunnelRxVport->portNo, 0,
>>-
>>NET_BUFFER_LIST_SWITCH_FORWARDING_DETAIL(newNbl),
>>-                         ovsFwdCtx->completionList,
>>-                         &ovsFwdCtx->layers, FALSE);
>>+    if (newNbl) {
>>+        /* Decapsulated packet is in a new NBL */
>>+        ovsFwdCtx->tunnelRxNic = tunnelRxVport;
>>+        OvsInitForwardingCtx(ovsFwdCtx, ovsFwdCtx->switchContext,
>>+                             newNbl, tunnelRxVport->portNo, 0,
>>+
>>NET_BUFFER_LIST_SWITCH_FORWARDING_DETAIL(newNbl),
>>+                             ovsFwdCtx->completionList,
>>+                             &ovsFwdCtx->layers, FALSE);
>>
>>-    /*
>>-     * Set the NBL's SourcePortId and SourceNicIndex to default values to
>>-     * keep NDIS happy when we forward the packet.
>>-     */
>>-    ovsFwdCtx->fwdDetail->SourcePortId = NDIS_SWITCH_DEFAULT_PORT_ID;
>>-    ovsFwdCtx->fwdDetail->SourceNicIndex = 0;
>>+        /*
>>+         * Set the NBL's SourcePortId and SourceNicIndex to default values to
>>+         * keep NDIS happy when we forward the packet.
>>+         */
>>+        ovsFwdCtx->fwdDetail->SourcePortId = NDIS_SWITCH_DEFAULT_PORT_ID;
>>+        ovsFwdCtx->fwdDetail->SourceNicIndex = 0;
>>
>>-    status = OvsDoFlowLookupOutput(ovsFwdCtx);
>>+        status = OvsDoFlowLookupOutput(ovsFwdCtx);
>>+    }
>>     ASSERT(ovsFwdCtx->curNbl == NULL);
>>     OvsClearTunRxCtx(ovsFwdCtx);
>>
>>diff --git a/datapath-windows/ovsext/Stt.c b/datapath-windows/ovsext/Stt.c
>>index b78ef95..ef44d23 100644
>>--- a/datapath-windows/ovsext/Stt.c
>>+++ b/datapath-windows/ovsext/Stt.c
>>@@ -34,6 +34,7 @@
>> #endif
>> #define OVS_DBG_MOD OVS_DBG_STT
>> #include "Debug.h"
>>+#include "Jhash.h"
>>
>> KSTART_ROUTINE OvsSttDefragCleaner;
>> static PLIST_ENTRY OvsSttPktFragHash;
>>@@ -152,8 +153,8 @@ OvsDoEncapStt(POVS_VPORT_ENTRY vport,
>>     UINT32 headRoom = OvsGetSttTunHdrSize();
>>     UINT32 tcpChksumLen;
>>     PUINT8 bufferStart;
>>-
>>-    UNREFERENCED_PARAMETER(layers);
>>+    ULONG mss = 0;
>>+    NDIS_TCP_LARGE_SEND_OFFLOAD_NET_BUFFER_LIST_INFO lsoInfo;
>>
>>     curNb = NET_BUFFER_LIST_FIRST_NB(curNbl);
>>
>>@@ -162,14 +163,20 @@ OvsDoEncapStt(POVS_VPORT_ENTRY vport,
>>     BOOLEAN innerPartialChecksum = FALSE;
>>
>>     if (layers->isTcp) {
>>-        NDIS_TCP_LARGE_SEND_OFFLOAD_NET_BUFFER_LIST_INFO lsoInfo;
>>-
>>         lsoInfo.Value = NET_BUFFER_LIST_INFO(curNbl,
>>                 TcpLargeSendNetBufferListInfo);
>>-        if (lsoInfo.LsoV1Transmit.MSS) {
>>-            /* XXX We don't handle LSO yet */
>>-            OVS_LOG_ERROR("LSO on STT is not supported");
>>-            return NDIS_STATUS_FAILURE;
>>+
>>+        switch (lsoInfo.Transmit.Type) {
>>+            case NDIS_TCP_LARGE_SEND_OFFLOAD_V1_TYPE:
>>+                mss = lsoInfo.LsoV1Transmit.MSS;
>>+                break;
>>+            case NDIS_TCP_LARGE_SEND_OFFLOAD_V2_TYPE:
>>+                mss = lsoInfo.LsoV2Transmit.MSS;
>>+                break;
>>+            default:
>>+                OVS_LOG_ERROR("Unknown LSO transmit type:%d",
>>+                              lsoInfo.Transmit.Type);
>>+                return NDIS_STATUS_FAILURE;
>>         }
>>     }
>>
>>@@ -186,21 +193,36 @@ OvsDoEncapStt(POVS_VPORT_ENTRY vport,
>>         return NDIS_STATUS_FAILURE;
>>     }
>>
>>-    curNb = NET_BUFFER_LIST_FIRST_NB(*newNbl);
>>+    curNbl = *newNbl;
>>+    curNb = NET_BUFFER_LIST_FIRST_NB(curNbl);
>>     curMdl = NET_BUFFER_CURRENT_MDL(curNb);
>>+    /* NB Chain should be split before */
>>+    ASSERT(NET_BUFFER_NEXT_NB(curNb) == NULL);
>>+    innerFrameLen = NET_BUFFER_DATA_LENGTH(curNb);
>>+
>>     bufferStart = (PUINT8)MmGetSystemAddressForMdlSafe(curMdl,
>>                                                        LowPagePriority);
>>     bufferStart += NET_BUFFER_CURRENT_MDL_OFFSET(curNb);
>>
>>-    if (layers->isIPv4 && csumInfo.Transmit.IpHeaderChecksum) {
>>+    if (layers->isIPv4) {
>>         IPHdr *ip = (IPHdr *)(bufferStart + layers->l3Offset);
>>-        ip->check = IPChecksum((UINT8 *)ip, ip->ihl * 4, 0);
>>+        if (!ip->tot_len) {
>>+            ip->tot_len = htons(innerFrameLen - sizeof(EthHdr));
>>+        }
>>+        if (!ip->check) {
>>+            ip->check = IPChecksum((UINT8 *)ip, ip->ihl * 4, 0);
>>+        }
>>     }
>>+
>>     if (layers->isTcp) {
>>-        if(!csumInfo.Transmit.TcpChecksum) {
>>-            innerChecksumVerified = TRUE;
>>-        } else {
>>+        if (mss) {
>>             innerPartialChecksum = TRUE;
>>+        } else {
>>+            if (!csumInfo.Transmit.TcpChecksum) {
>>+                innerChecksumVerified = TRUE;
>>+            } else {
>>+                innerPartialChecksum = TRUE;
>>+            }
>>         }
>>     } else if (layers->isUdp) {
>>         if(!csumInfo.Transmit.UdpChecksum) {
>>@@ -210,24 +232,6 @@ OvsDoEncapStt(POVS_VPORT_ENTRY vport,
>>         }
>>     }
>>
>>-    curNbl = *newNbl;
>>-    curNb = NET_BUFFER_LIST_FIRST_NB(curNbl);
>>-    /* NB Chain should be split before */
>>-    ASSERT(NET_BUFFER_NEXT_NB(curNb) == NULL);
>>-
>>-    innerFrameLen = NET_BUFFER_DATA_LENGTH(curNb);
>>-    /*
>>-     * External port can't be removed as we hold the dispatch lock
>>-     * We also check if the external port was removed beforecalling
>>-     * port encapsulation functions
>>-     */
>>-    if (innerFrameLen > OvsGetExternalMtu(switchContext) - headRoom) {
>>-        OVS_LOG_ERROR("Packet too large (size %d, mtu %d). Can't encapsulate",
>>-                innerFrameLen, OvsGetExternalMtu(switchContext));
>>-        status = NDIS_STATUS_FAILURE;
>>-        goto ret_error;
>>-    }
>>-
>>     status = NdisRetreatNetBufferDataStart(curNb, headRoom, 0, NULL);
>>     if (status != NDIS_STATUS_SUCCESS) {
>>         ASSERT(!"Unable to NdisRetreatNetBufferDataStart(headroom)");
>>@@ -301,33 +305,52 @@ OvsDoEncapStt(POVS_VPORT_ENTRY vport,
>>                                           IPPROTO_TCP, (uint16) tcpChksumLen);
>>     sttHdr->version = 0;
>>
>>-    /* XXX need to peek into the inner packet, hard code for now */
>>-    sttHdr->flags = STT_PROTO_IPV4;
>>-    if (innerChecksumVerified) {
>>-        sttHdr->flags |= STT_CSUM_VERIFIED;
>>-    } else if (innerPartialChecksum) {
>>+    /* Set STT Header */
>>+    sttHdr->flags = 0;
>>+    if (innerPartialChecksum) {
>>         sttHdr->flags |= STT_CSUM_PARTIAL;
>>+        if (layers->isIPv4) {
>>+            sttHdr->flags |= STT_PROTO_IPV4;
>>+        }
>>+        if (layers->isTcp) {
>>+            sttHdr->flags |= STT_PROTO_TCP;
>>+        }
>>+        sttHdr->l4Offset = (UINT8) layers->l4Offset;
>>+        sttHdr->mss = (UINT16) htons(mss);
>>+    } else if (innerChecksumVerified) {
>>+        sttHdr->flags = STT_CSUM_VERIFIED;
>>+        sttHdr->l4Offset = 0;
>>+        sttHdr->mss = 0;
>>     }
>>-    sttHdr->l4Offset = 0;
>>
>>     sttHdr->reserved = 0;
>>-    /* XXX Used for large TCP packets.Not sure how it is used, clarify */
>>-    sttHdr->mss = 0;
>>     sttHdr->vlanTCI = 0;
>>     sttHdr->key = tunKey->tunnelId;
>>     /* Zero out stt padding */
>>     *(uint16 *)(sttHdr + 1) = 0;
>>
>>     /* Offload IP and TCP checksum */
>>+    ULONG tcpHeaderOffset = sizeof *outerEthHdr +
>>+                        outerIpHdr->ihl * 4;
>>     csumInfo.Value = 0;
>>     csumInfo.Transmit.IpHeaderChecksum = 1;
>>     csumInfo.Transmit.TcpChecksum = 1;
>>     csumInfo.Transmit.IsIPv4 = 1;
>>-    csumInfo.Transmit.TcpHeaderOffset = sizeof *outerEthHdr +
>>-                                        outerIpHdr->ihl * 4;
>>+    csumInfo.Transmit.TcpHeaderOffset = tcpHeaderOffset;
>>     NET_BUFFER_LIST_INFO(curNbl,
>>                          TcpIpChecksumNetBufferListInfo) = csumInfo.Value;
>>
>>+    UINT32 encapMss = OvsGetExternalMtu(switchContext) - sizeof(IPHdr) - sizeof(TCPHdr);
>>+    if (ipTotalLen > encapMss) {
>>+        lsoInfo.Value = 0;
>>+        lsoInfo.LsoV2Transmit.TcpHeaderOffset = tcpHeaderOffset;
>>+        lsoInfo.LsoV2Transmit.MSS = encapMss;
>>+        lsoInfo.LsoV2Transmit.Type = NDIS_TCP_LARGE_SEND_OFFLOAD_V2_TYPE;
>>+        lsoInfo.LsoV2Transmit.IPVersion = NDIS_TCP_LARGE_SEND_OFFLOAD_IPv4;
>>+        NET_BUFFER_LIST_INFO(curNbl,
>>+                             TcpLargeSendNetBufferListInfo) = lsoInfo.Value;
>>+    }
>>+
>>     return STATUS_SUCCESS;
>>
>> ret_error:
>>@@ -338,16 +361,22 @@ ret_error:
>>
>> /*
>>
>>*-------------------------------------------------------------------------
>>---
>>- * OvsCalculateTCPChecksum
>>- *     Calculate TCP checksum
>>+ * OvsValidateTCPChecksum
>>+ *     Validate TCP checksum
>>
>>*-------------------------------------------------------------------------
>>---
>>  */
>> static __inline NDIS_STATUS
>>-OvsCalculateTCPChecksum(PNET_BUFFER_LIST curNbl, PNET_BUFFER curNb)
>>+OvsValidateTCPChecksum(PNET_BUFFER_LIST curNbl, PNET_BUFFER curNb)
>> {
>>     NDIS_TCP_IP_CHECKSUM_NET_BUFFER_LIST_INFO csumInfo;
>>     csumInfo.Value = NET_BUFFER_LIST_INFO(curNbl,
>>
>>TcpIpChecksumNetBufferListInfo);
>>+
>>+    /* Check if NIC has indicated TCP checksum failure */
>>+    if (csumInfo.Receive.TcpChecksumFailed) {
>>+        return NDIS_STATUS_INVALID_PACKET;
>>+    }
>>+
>>     UINT16 checkSum;
>>
>>     /* Check if TCP Checksum has been calculated by NIC */
>>@@ -399,10 +428,9 @@ OvsInitSttDefragmentation()
>>     NdisAllocateSpinLock(&OvsSttSpinLock);
>>
>>     /* Init the Hash Buffer */
>>-    OvsSttPktFragHash = (PLIST_ENTRY) OvsAllocateMemoryWithTag(
>>-                                                sizeof(LIST_ENTRY)
>>-                                                * STT_HASH_TABLE_SIZE,
>>-                                                OVS_STT_POOL_TAG);
>>+    OvsSttPktFragHash = OvsAllocateMemoryWithTag(sizeof(LIST_ENTRY)
>>+                                                 * STT_HASH_TABLE_SIZE,
>>+                                                 OVS_STT_POOL_TAG);
>>     if (OvsSttPktFragHash == NULL) {
>>         NdisFreeSpinLock(&OvsSttSpinLock);
>>         return STATUS_INSUFFICIENT_RESOURCES;
>>@@ -487,6 +515,7 @@ OvsSttDefragCleaner(PVOID data)
>>                 entry = CONTAINING_RECORD(link, OVS_STT_PKT_ENTRY, link);
>>                 if (entry->timeout < currentTime) {
>>                     RemoveEntryList(&entry->link);
>>+                    OvsFreeMemoryWithTag(entry->packetBuf,
>>OVS_STT_POOL_TAG);
>>                     OvsFreeMemoryWithTag(entry, OVS_STT_POOL_TAG);
>>                 }
>>             }
>>@@ -500,6 +529,158 @@ OvsSttDefragCleaner(PVOID data)
>>     PsTerminateSystemThread(STATUS_SUCCESS);
>> }
>>
>>+static OVS_STT_PKT_KEY
>>+OvsGeneratePacketKey(IPHdr *ipHdr, TCPHdr *tcpHdr)
>>+{
>>+    OVS_STT_PKT_KEY key;
>>+    key.sAddr = ipHdr->saddr;
>>+    key.dAddr = ipHdr->daddr;
>>+    key.ackSeq = ntohl(tcpHdr->ack_seq);
>>+    return key;
>>+}
>>+
>>+static UINT32
>>+OvsSttGetPktHash(OVS_STT_PKT_KEY *pktKey)
>>+{
>>+    UINT32 arr[3];
>>+    arr[0] = pktKey->ackSeq;
>>+    arr[1] = pktKey->dAddr;
>>+    arr[2] = pktKey->sAddr;
>>+    return OvsJhashWords(arr, 3, OVS_HASH_BASIS);
>>+}
>>+
>>+static VOID *
>>+OvsLookupPktFrag(OVS_STT_PKT_KEY *pktKey, UINT32 hash)
>>+{
>>+    PLIST_ENTRY link;
>>+    POVS_STT_PKT_ENTRY entry;
>>+
>>+    LIST_FORALL(&OvsSttPktFragHash[hash & STT_HASH_TABLE_MASK], link) {
>>+        entry = CONTAINING_RECORD(link, OVS_STT_PKT_ENTRY, link);
>>+        if (entry->ovsPktKey.ackSeq == pktKey->ackSeq &&
>>+            entry->ovsPktKey.dAddr == pktKey->dAddr &&
>>+            entry->ovsPktKey.sAddr == pktKey->sAddr) {
>>+            return entry;
>>+        }
>>+    }
>>+    return NULL;
>>+}
>>+
>>+/*
>>+*
>>+-------------------------------------------------------------------------
>>-
>>+* OvsSttReassemble --
>>+*     Reassemble an LSO packet from multiple STT-Fragments.
>>+*
>>+-------------------------------------------------------------------------
>>-
>>+*/
>>+PNET_BUFFER_LIST
>>+OvsSttReassemble(POVS_SWITCH_CONTEXT switchContext,
>>+                 PNET_BUFFER_LIST curNbl,
>>+                 IPHdr *ipHdr,
>>+                 TCPHdr *tcp,
>>+                 SttHdr *newSttHdr,
>>+                 UINT16 payloadLen)
>>+{
>>+    UINT32 seq = ntohl(tcp->seq);
>>+    UINT32 innerPacketLen = (seq >> STT_SEQ_LEN_SHIFT) - STT_HDR_LEN;
>>+    UINT32 segOffset = STT_SEGMENT_OFF(seq);
>>+    UINT32 offset = segOffset == 0 ? 0 : segOffset - STT_HDR_LEN;
>>+    UINT32 startOffset = 0;
>>+    OVS_STT_PKT_ENTRY *pktFragEntry;
>>+    PNET_BUFFER_LIST targetPNbl = NULL;
>>+    BOOLEAN lastPacket = FALSE;
>>+    PNET_BUFFER sourceNb;
>>+    UINT32 fragmentLength = payloadLen;
>>+    SttHdr stt;
>>+    SttHdr *sttHdr = NULL;
>>+    sourceNb = NET_BUFFER_LIST_FIRST_NB(curNbl);
>>+
>>+    /* XXX optimize this lock */
>>+    NdisAcquireSpinLock(&OvsSttSpinLock);
>>+
>>+    /* If this is the first fragment, copy the STT header */
>>+    if (segOffset == 0) {
>>+        sttHdr = NdisGetDataBuffer(sourceNb, sizeof(SttHdr), &stt, 1, 0);
>>+        if (sttHdr == NULL) {
>>+            OVS_LOG_ERROR("Unable to retrieve STT header");
>>+            return NULL;
>>+        }
>>+        fragmentLength = fragmentLength - STT_HDR_LEN;
>>+        startOffset = startOffset + STT_HDR_LEN;
>>+    }
>>+
>>+    /* Lookup fragment */
>>+    OVS_STT_PKT_KEY pktKey = OvsGeneratePacketKey(ipHdr, tcp);
>>+    UINT32 hash = OvsSttGetPktHash(&pktKey);
>>+    pktFragEntry = OvsLookupPktFrag(&pktKey, hash);
>>+
>>+    if (pktFragEntry == NULL) {
>>+        /* Create a new Packet Entry */
>>+        POVS_STT_PKT_ENTRY entry;
>>+        entry = OvsAllocateMemoryWithTag(sizeof(OVS_STT_PKT_ENTRY),
>>+                                         OVS_STT_POOL_TAG);
>>+        RtlZeroMemory(entry, sizeof (OVS_STT_PKT_ENTRY));
>>+
>>+        /* Update Key, timestamp and recvdLen */
>>+        NdisMoveMemory(&entry->ovsPktKey, &pktKey, sizeof
>>(OVS_STT_PKT_KEY));
>>+
>>+        entry->recvdLen = fragmentLength;
>>+
>>+        UINT64 currentTime;
>>+        NdisGetCurrentSystemTime((LARGE_INTEGER *) &currentTime);
>>+        entry->timeout = currentTime + STT_ENTRY_TIMEOUT;
>>+
>>+        if (segOffset == 0) {
>>+            entry->sttHdr = *sttHdr;
>>+        }
>>+
>>+        /* Copy the data from Source to new buffer */
>>+        entry->packetBuf = OvsAllocateMemoryWithTag(innerPacketLen,
>>+                                                    OVS_STT_POOL_TAG);
>>+        if (OvsGetPacketBytes(curNbl, fragmentLength, startOffset,
>>+                              entry->packetBuf + offset) == NULL) {
>>+            OVS_LOG_ERROR("Error when obtaining bytes from Packet");
>>+            goto handle_error;
>>+        }
>>+
>>+        /* Insert the entry in the Static Buffer */
>>+        InsertHeadList(&OvsSttPktFragHash[hash & STT_HASH_TABLE_MASK],
>>+                       &entry->link);
>>+    } else {
>>+        /* Add to recieved length to identify if this is the last
>>fragment */
>>+        pktFragEntry->recvdLen += fragmentLength;
>>+        lastPacket = (pktFragEntry->recvdLen == innerPacketLen);
>>+
>>+        if (segOffset == 0) {
>>+            pktFragEntry->sttHdr = *sttHdr;
>>+        }
>>+
>>+        /* Copy the fragment data from Source to existing buffer */
>>+        if (OvsGetPacketBytes(curNbl, fragmentLength, startOffset,
>>+                              pktFragEntry->packetBuf + offset) == NULL)
>>{
>>+            OVS_LOG_ERROR("Error when obtaining bytes from Packet");
>>+            goto handle_error;
>>+        }
>>+    }
>>+
>>+handle_error:
>>+    if (lastPacket) {
>>+        /* Retrieve the original STT header */
>>+        NdisMoveMemory(newSttHdr, &pktFragEntry->sttHdr, sizeof
>>(SttHdr));
>>+        targetPNbl = OvsAllocateNBLFromBuffer(switchContext,
>>pktFragEntry->packetBuf,
>>+                                              innerPacketLen);
>>+
>>+        /* Delete this entry and free up the memory/ */
>>+        RemoveEntryList(&pktFragEntry->link);
>>+        OvsFreeMemoryWithTag(pktFragEntry->packetBuf, OVS_STT_POOL_TAG);
>>+        OvsFreeMemoryWithTag(pktFragEntry, OVS_STT_POOL_TAG);
>>+    }
>>+
>>+    NdisReleaseSpinLock(&OvsSttSpinLock);
>>+    return lastPacket ? targetPNbl : NULL;
>>+}
>>+
>> /*
>>  *
>>--------------------------------------------------------------------------
>>  * OvsDecapStt --
>>@@ -513,34 +694,20 @@ OvsDecapStt(POVS_SWITCH_CONTEXT switchContext,
>>             PNET_BUFFER_LIST *newNbl)
>> {
>>     NDIS_STATUS status = NDIS_STATUS_FAILURE;
>>-    PNET_BUFFER curNb;
>>+    PNET_BUFFER curNb, newNb;
>>     IPHdr *ipHdr;
>>     char *ipBuf[sizeof(IPHdr)];
>>+    SttHdr stt;
>>     SttHdr *sttHdr;
>>     char *sttBuf[STT_HDR_LEN];
>>     UINT32 advanceCnt, hdrLen;
>>-    NDIS_TCP_IP_CHECKSUM_NET_BUFFER_LIST_INFO csumInfo;
>>+    BOOLEAN isLsoPacket = FALSE;
>>
>>     curNb = NET_BUFFER_LIST_FIRST_NB(curNbl);
>>     ASSERT(NET_BUFFER_NEXT_NB(curNb) == NULL);
>>
>>-    if (NET_BUFFER_DATA_LENGTH(curNb) < OvsGetSttTunHdrSize()) {
>>-        OVS_LOG_ERROR("Packet length received is less than the tunnel
>>header:"
>>-            " %d<%d\n", NET_BUFFER_DATA_LENGTH(curNb),
>>OvsGetSttTunHdrSize());
>>-        return NDIS_STATUS_INVALID_LENGTH;
>>-    }
>>-
>>-    /* Verify outer TCP Checksum */
>>-    csumInfo.Value = NET_BUFFER_LIST_INFO(curNbl,
>>-
>>TcpIpChecksumNetBufferListInfo);
>>-
>>-    /* Check if NIC has indicated TCP checksum failure */
>>-    if (csumInfo.Receive.TcpChecksumFailed) {
>>-        return NDIS_STATUS_INVALID_PACKET;
>>-    }
>>-
>>-    /* Calculate the TCP Checksum */
>>-    status = OvsCalculateTCPChecksum(curNbl, curNb);
>>+    /* Validate the TCP Checksum */
>>+    status = OvsValidateTCPChecksum(curNbl, curNb);
>>     if (status != NDIS_STATUS_SUCCESS) {
>>         return status;
>>     }
>>@@ -554,34 +721,73 @@ OvsDecapStt(POVS_SWITCH_CONTEXT switchContext,
>>                                                     1 /*no align*/, 0);
>>     ASSERT(ipHdr);
>>
>>+    TCPHdr *tcp = (TCPHdr *)((PCHAR)ipHdr + ipHdr->ihl * 4);
>>+
>>     /* Skip IP & TCP headers */
>>     hdrLen = sizeof(IPHdr) + sizeof(TCPHdr),
>>     NdisAdvanceNetBufferDataStart(curNb, hdrLen, FALSE, NULL);
>>     advanceCnt += hdrLen;
>>
>>-    /* STT Header */
>>-    sttHdr = NdisGetDataBuffer(curNb, sizeof *sttHdr, (PVOID) &sttBuf,
>>-                                                    1 /*no align*/, 0);
>>+    UINT32 seq = ntohl(tcp->seq);
>>+    UINT32 totalLen = (seq >> STT_SEQ_LEN_SHIFT);
>>+    UINT16 payloadLen = (UINT16)ntohs(ipHdr->tot_len)
>>+                        - (ipHdr->ihl * 4)
>>+                        - (sizeof * tcp);
>>+
>>+    /* Check if incoming packet requires reassembly */
>>+    if (totalLen != payloadLen) {
>>+        sttHdr = &stt;
>>+        PNET_BUFFER_LIST pNbl = OvsSttReassemble(switchContext, curNbl,
>>+                                                 ipHdr, tcp, sttHdr,
>>+                                                 payloadLen);
>>+        if (pNbl == NULL) {
>>+            return NDIS_STATUS_SUCCESS;
>>+        }
>>+
>>+        *newNbl = pNbl;
>>+        isLsoPacket = TRUE;
>>+    } else {
>>+        /* STT Header */
>>+        sttHdr = NdisGetDataBuffer(curNb, sizeof *sttHdr,
>>+                                   (PVOID) &sttBuf, 1 /*no align*/, 0);
>>+        /* Skip stt header, DataOffset points to inner pkt now. */
>>+        hdrLen = STT_HDR_LEN;
>>+        NdisAdvanceNetBufferDataStart(curNb, hdrLen, FALSE, NULL);
>>+        advanceCnt += hdrLen;
>>+
>>+        *newNbl = OvsPartialCopyNBL(switchContext, curNbl, 0,
>>+                                    0, FALSE /*copy NBL info*/);
>>+    }
>>+
>>+    if (*newNbl == NULL) {
>>+        OVS_LOG_ERROR("Unable to allocate a new cloned NBL");
>>+        return NDIS_STATUS_RESOURCES;
>>+    }
>>+
>>+    status = NdisRetreatNetBufferDataStart(curNb, advanceCnt, 0, NULL);
>>+    if (status != NDIS_STATUS_SUCCESS) {
>>+        OvsCompleteNBL(switchContext, *newNbl, TRUE);
>>+        return NDIS_STATUS_FAILURE;
>>+    }
>>+    newNb = NET_BUFFER_LIST_FIRST_NB(*newNbl);
>>+
>>     ASSERT(sttHdr);
>>
>>     /* Initialize the tunnel key */
>>     tunKey->dst = ipHdr->daddr;
>>     tunKey->src = ipHdr->saddr;
>>     tunKey->tunnelId = sttHdr->key;
>>-    tunKey->flags = (OVS_TNL_F_CSUM | OVS_TNL_F_KEY);
>>+    tunKey->flags = OVS_TNL_F_KEY;
>>     tunKey->tos = ipHdr->tos;
>>     tunKey->ttl = ipHdr->ttl;
>>     tunKey->pad = 0;
>>
>>-    /* Skip stt header, DataOffset points to inner pkt now. */
>>-    hdrLen = STT_HDR_LEN;
>>-    NdisAdvanceNetBufferDataStart(curNb, hdrLen, FALSE, NULL);
>>-    advanceCnt += hdrLen;
>>+    BOOLEAN requiresLSO = sttHdr->mss != 0;
>>
>>     /* Verify checksum for inner packet if it's required */
>>     if (!(sttHdr->flags & STT_CSUM_VERIFIED)) {
>>         BOOLEAN innerChecksumPartial = sttHdr->flags & STT_CSUM_PARTIAL;
>>-        EthHdr *eth = (EthHdr *)NdisGetDataBuffer(curNb, sizeof(EthHdr),
>>+        EthHdr *eth = (EthHdr *)NdisGetDataBuffer(newNb, sizeof(EthHdr),
>>                                                   NULL, 1, 0);
>>
>>         /* XXX Figure out a way to offload checksum receives */
>>@@ -597,14 +803,16 @@ OvsDecapStt(POVS_SWITCH_CONTEXT switchContext,
>>                                                   IPPROTO_TCP,
>>                                                   (UINT16)l4Payload);
>>                 }
>>-                tcp->check = CalculateChecksumNB(curNb, l4Payload,
>>offset);
>>+                if (!requiresLSO) {
>>+                    tcp->check = CalculateChecksumNB(newNb, l4Payload,
>>offset);
>>+                }
>>             } else if (ip->protocol == IPPROTO_UDP) {
>>                 UDPHdr *udp = (UDPHdr *)((PCHAR)ip + sizeof *ip);
>>                 if (!innerChecksumPartial){
>>                     udp->check = IPPseudoChecksum(&ip->saddr, &ip->daddr,
>>                                                   IPPROTO_UDP,
>>l4Payload);
>>                 }
>>-                udp->check = CalculateChecksumNB(curNb, l4Payload,
>>offset);
>>+                udp->check = CalculateChecksumNB(newNb, l4Payload,
>>offset);
>>             }
>>         } else if (eth->Type == ntohs(NDIS_ETH_TYPE_IPV6)) {
>>             IPv6Hdr *ip = (IPv6Hdr *)((PCHAR)eth + sizeof *eth);
>>@@ -617,7 +825,9 @@ OvsDecapStt(POVS_SWITCH_CONTEXT switchContext,
>>                                                     (UINT32 *)&ip->daddr,
>>                                                     IPPROTO_TCP,
>>totalLength);
>>                 }
>>-                tcp->check = CalculateChecksumNB(curNb, totalLength,
>>offset);
>>+                if (!requiresLSO) {
>>+                    tcp->check = CalculateChecksumNB(newNb, totalLength,
>>offset);
>>+                }
>>             }
>>             else if (ip->nexthdr == IPPROTO_UDP) {
>>                 UDPHdr *udp = (UDPHdr *)((PCHAR)ip + sizeof *ip);
>>@@ -626,23 +836,27 @@ OvsDecapStt(POVS_SWITCH_CONTEXT switchContext,
>>                                                     (UINT32 *)&ip->daddr,
>>                                                     IPPROTO_UDP,
>>totalLength);
>>                 }
>>-                udp->check = CalculateChecksumNB(curNb, totalLength,
>>offset);
>>+                udp->check = CalculateChecksumNB(newNb, totalLength,
>>offset);
>>             }
>>         }
>>
>>-        NET_BUFFER_LIST_INFO(curNbl, TcpIpChecksumNetBufferListInfo) = 0;
>>+        NET_BUFFER_LIST_INFO(*newNbl, TcpIpChecksumNetBufferListInfo) =
>>0;
>>     }
>>
>>-    *newNbl = OvsPartialCopyNBL(switchContext, curNbl,
>>OVS_DEFAULT_COPY_SIZE,
>>-                                0, FALSE /*copy NBL info*/);
>>-
>>-    ASSERT(advanceCnt == OvsGetSttTunHdrSize());
>>-    status = NdisRetreatNetBufferDataStart(curNb, advanceCnt, 0, NULL);
>>-
>>-    if (*newNbl == NULL) {
>>-        OVS_LOG_ERROR("OvsDecapStt: Unable to allocate a new cloned
>>NBL");
>>-        status = NDIS_STATUS_RESOURCES;
>>+    if (requiresLSO) {
>>+        NDIS_TCP_LARGE_SEND_OFFLOAD_NET_BUFFER_LIST_INFO lsoInfo;
>>+        lsoInfo.Value = 0;
>>+        lsoInfo.LsoV2Transmit.TcpHeaderOffset = sttHdr->l4Offset;
>>+        lsoInfo.LsoV2Transmit.MSS = ETH_DEFAULT_MTU - sizeof(IPHdr) -
>>sizeof(TCPHdr);
>>+        lsoInfo.LsoV2Transmit.Type = NDIS_TCP_LARGE_SEND_OFFLOAD_V2_TYPE;
>>+        if (sttHdr->flags & STT_PROTO_IPV4) {
>>+            lsoInfo.LsoV2Transmit.IPVersion =
>>NDIS_TCP_LARGE_SEND_OFFLOAD_IPv4;
>>+        } else {
>>+            lsoInfo.LsoV2Transmit.IPVersion =
>>NDIS_TCP_LARGE_SEND_OFFLOAD_IPv6;
>>+        }
>>+        NET_BUFFER_LIST_INFO(*newNbl,
>>+                                TcpLargeSendNetBufferListInfo) =
>>lsoInfo.Value;
>>     }
>>
>>-    return status;
>>+    return NDIS_STATUS_SUCCESS;
>> }
>>--
>>1.9.5.msysgit.0
>>
>
> _______________________________________________
> dev mailing list
> dev@openvswitch.org
> http://openvswitch.org/mailman/listinfo/dev
diff mbox

Patch

diff --git a/datapath-windows/ovsext/Actions.c b/datapath-windows/ovsext/Actions.c
index b4644a7..ce592b3 100644
--- a/datapath-windows/ovsext/Actions.c
+++ b/datapath-windows/ovsext/Actions.c
@@ -594,7 +594,7 @@  OvsDoFlowLookupOutput(OvsForwardingContext *ovsFwdCtx)
         InitializeListHead(&missedPackets);
         status = OvsCreateAndAddPackets(NULL, 0, OVS_PACKET_CMD_MISS, vport,
                           &key,ovsFwdCtx->curNbl,
-                          ovsFwdCtx->tunnelRxNic != NULL, &ovsFwdCtx->layers,
+                          FALSE, &ovsFwdCtx->layers,
                           ovsFwdCtx->switchContext, &missedPackets, &num);
         if (num) {
             OvsQueuePackets(&missedPackets, num);
@@ -709,6 +709,7 @@  OvsTunnelPortRx(OvsForwardingContext *ovsFwdCtx)
     NDIS_STATUS status = NDIS_STATUS_SUCCESS;
     PNET_BUFFER_LIST newNbl = NULL;
     POVS_VPORT_ENTRY tunnelRxVport = ovsFwdCtx->tunnelRxNic;
+    PCWSTR dropReason = L"OVS-dropped due to new decap packet";
 
     if (OvsValidateIPChecksum(ovsFwdCtx->curNbl, &ovsFwdCtx->layers)
             != NDIS_STATUS_SUCCESS) {
@@ -730,6 +731,10 @@  OvsTunnelPortRx(OvsForwardingContext *ovsFwdCtx)
     case OVS_VPORT_TYPE_STT:
         status = OvsDecapStt(ovsFwdCtx->switchContext, ovsFwdCtx->curNbl,
                              &ovsFwdCtx->tunKey, &newNbl);
+        if (status == NDIS_STATUS_SUCCESS && newNbl == NULL) {
+            /* This was an STT-LSO Fragment */
+            dropReason = L"OVS-STT segment is cached";
+        }
         break;
     default:
         OVS_LOG_ERROR("Rx: Unhandled tunnel type: %d\n",
@@ -747,25 +752,26 @@  OvsTunnelPortRx(OvsForwardingContext *ovsFwdCtx)
      * tunnelRxNic and other fields will be cleared, re-init the context
      * before usage.
       */
-    OvsCompleteNBLForwardingCtx(ovsFwdCtx,
-                                L"OVS-dropped due to new decap packet");
+    OvsCompleteNBLForwardingCtx(ovsFwdCtx, dropReason);
 
-    /* Decapsulated packet is in a new NBL */
-    ovsFwdCtx->tunnelRxNic = tunnelRxVport;
-    OvsInitForwardingCtx(ovsFwdCtx, ovsFwdCtx->switchContext,
-                         newNbl, tunnelRxVport->portNo, 0,
-                         NET_BUFFER_LIST_SWITCH_FORWARDING_DETAIL(newNbl),
-                         ovsFwdCtx->completionList,
-                         &ovsFwdCtx->layers, FALSE);
+    if (newNbl) {
+        /* Decapsulated packet is in a new NBL */
+        ovsFwdCtx->tunnelRxNic = tunnelRxVport;
+        OvsInitForwardingCtx(ovsFwdCtx, ovsFwdCtx->switchContext,
+                             newNbl, tunnelRxVport->portNo, 0,
+                             NET_BUFFER_LIST_SWITCH_FORWARDING_DETAIL(newNbl),
+                             ovsFwdCtx->completionList,
+                             &ovsFwdCtx->layers, FALSE);
 
-    /*
-     * Set the NBL's SourcePortId and SourceNicIndex to default values to
-     * keep NDIS happy when we forward the packet.
-     */
-    ovsFwdCtx->fwdDetail->SourcePortId = NDIS_SWITCH_DEFAULT_PORT_ID;
-    ovsFwdCtx->fwdDetail->SourceNicIndex = 0;
+        /*
+         * Set the NBL's SourcePortId and SourceNicIndex to default values to
+         * keep NDIS happy when we forward the packet.
+         */
+        ovsFwdCtx->fwdDetail->SourcePortId = NDIS_SWITCH_DEFAULT_PORT_ID;
+        ovsFwdCtx->fwdDetail->SourceNicIndex = 0;
 
-    status = OvsDoFlowLookupOutput(ovsFwdCtx);
+        status = OvsDoFlowLookupOutput(ovsFwdCtx);
+    }
     ASSERT(ovsFwdCtx->curNbl == NULL);
     OvsClearTunRxCtx(ovsFwdCtx);
 
diff --git a/datapath-windows/ovsext/Stt.c b/datapath-windows/ovsext/Stt.c
index b78ef95..ef44d23 100644
--- a/datapath-windows/ovsext/Stt.c
+++ b/datapath-windows/ovsext/Stt.c
@@ -34,6 +34,7 @@ 
 #endif
 #define OVS_DBG_MOD OVS_DBG_STT
 #include "Debug.h"
+#include "Jhash.h"
 
 KSTART_ROUTINE OvsSttDefragCleaner;
 static PLIST_ENTRY OvsSttPktFragHash;
@@ -152,8 +153,8 @@  OvsDoEncapStt(POVS_VPORT_ENTRY vport,
     UINT32 headRoom = OvsGetSttTunHdrSize();
     UINT32 tcpChksumLen;
     PUINT8 bufferStart;
-
-    UNREFERENCED_PARAMETER(layers);
+    ULONG mss = 0;
+    NDIS_TCP_LARGE_SEND_OFFLOAD_NET_BUFFER_LIST_INFO lsoInfo;
 
     curNb = NET_BUFFER_LIST_FIRST_NB(curNbl);
 
@@ -162,14 +163,20 @@  OvsDoEncapStt(POVS_VPORT_ENTRY vport,
     BOOLEAN innerPartialChecksum = FALSE;
 
     if (layers->isTcp) {
-        NDIS_TCP_LARGE_SEND_OFFLOAD_NET_BUFFER_LIST_INFO lsoInfo;
-
         lsoInfo.Value = NET_BUFFER_LIST_INFO(curNbl,
                 TcpLargeSendNetBufferListInfo);
-        if (lsoInfo.LsoV1Transmit.MSS) {
-            /* XXX We don't handle LSO yet */
-            OVS_LOG_ERROR("LSO on STT is not supported");
-            return NDIS_STATUS_FAILURE;
+
+        switch (lsoInfo.Transmit.Type) {
+            case NDIS_TCP_LARGE_SEND_OFFLOAD_V1_TYPE:
+                mss = lsoInfo.LsoV1Transmit.MSS;
+                break;
+            case NDIS_TCP_LARGE_SEND_OFFLOAD_V2_TYPE:
+                mss = lsoInfo.LsoV2Transmit.MSS;
+                break;
+            default:
+                OVS_LOG_ERROR("Unknown LSO transmit type:%d",
+                              lsoInfo.Transmit.Type);
+                return NDIS_STATUS_FAILURE;
         }
     }
 
@@ -186,21 +193,36 @@  OvsDoEncapStt(POVS_VPORT_ENTRY vport,
         return NDIS_STATUS_FAILURE;
     }
 
-    curNb = NET_BUFFER_LIST_FIRST_NB(*newNbl);
+    curNbl = *newNbl;
+    curNb = NET_BUFFER_LIST_FIRST_NB(curNbl);
     curMdl = NET_BUFFER_CURRENT_MDL(curNb);
+    /* NB Chain should be split before */
+    ASSERT(NET_BUFFER_NEXT_NB(curNb) == NULL);
+    innerFrameLen = NET_BUFFER_DATA_LENGTH(curNb);
+
     bufferStart = (PUINT8)MmGetSystemAddressForMdlSafe(curMdl,
                                                        LowPagePriority);
     bufferStart += NET_BUFFER_CURRENT_MDL_OFFSET(curNb);
 
-    if (layers->isIPv4 && csumInfo.Transmit.IpHeaderChecksum) {
+    if (layers->isIPv4) {
         IPHdr *ip = (IPHdr *)(bufferStart + layers->l3Offset);
-        ip->check = IPChecksum((UINT8 *)ip, ip->ihl * 4, 0);
+        if (!ip->tot_len) {
+            ip->tot_len = htons(innerFrameLen - sizeof(EthHdr));
+        }
+        if (!ip->check) {
+            ip->check = IPChecksum((UINT8 *)ip, ip->ihl * 4, 0);
+        }
     }
+
     if (layers->isTcp) {
-        if(!csumInfo.Transmit.TcpChecksum) {
-            innerChecksumVerified = TRUE;
-        } else {
+        if (mss) {
             innerPartialChecksum = TRUE;
+        } else {
+            if (!csumInfo.Transmit.TcpChecksum) {
+                innerChecksumVerified = TRUE;
+            } else {
+                innerPartialChecksum = TRUE;
+            }
         }
     } else if (layers->isUdp) {
         if(!csumInfo.Transmit.UdpChecksum) {
@@ -210,24 +232,6 @@  OvsDoEncapStt(POVS_VPORT_ENTRY vport,
         }
     }
 
-    curNbl = *newNbl;
-    curNb = NET_BUFFER_LIST_FIRST_NB(curNbl);
-    /* NB Chain should be split before */
-    ASSERT(NET_BUFFER_NEXT_NB(curNb) == NULL);
-
-    innerFrameLen = NET_BUFFER_DATA_LENGTH(curNb);
-    /*
-     * External port can't be removed as we hold the dispatch lock
-     * We also check if the external port was removed beforecalling
-     * port encapsulation functions
-     */
-    if (innerFrameLen > OvsGetExternalMtu(switchContext) - headRoom) {
-        OVS_LOG_ERROR("Packet too large (size %d, mtu %d). Can't encapsulate",
-                innerFrameLen, OvsGetExternalMtu(switchContext));
-        status = NDIS_STATUS_FAILURE;
-        goto ret_error;
-    }
-
     status = NdisRetreatNetBufferDataStart(curNb, headRoom, 0, NULL);
     if (status != NDIS_STATUS_SUCCESS) {
         ASSERT(!"Unable to NdisRetreatNetBufferDataStart(headroom)");
@@ -301,33 +305,52 @@  OvsDoEncapStt(POVS_VPORT_ENTRY vport,
                                           IPPROTO_TCP, (uint16) tcpChksumLen);
     sttHdr->version = 0;
 
-    /* XXX need to peek into the inner packet, hard code for now */
-    sttHdr->flags = STT_PROTO_IPV4;
-    if (innerChecksumVerified) {
-        sttHdr->flags |= STT_CSUM_VERIFIED;
-    } else if (innerPartialChecksum) {
+    /* Set STT Header */
+    sttHdr->flags = 0;
+    if (innerPartialChecksum) {
         sttHdr->flags |= STT_CSUM_PARTIAL;
+        if (layers->isIPv4) {
+            sttHdr->flags |= STT_PROTO_IPV4;
+        }
+        if (layers->isTcp) {
+            sttHdr->flags |= STT_PROTO_TCP;
+        }
+        sttHdr->l4Offset = (UINT8) layers->l4Offset;
+        sttHdr->mss = (UINT16) htons(mss);
+    } else if (innerChecksumVerified) {
+        sttHdr->flags = STT_CSUM_VERIFIED;
+        sttHdr->l4Offset = 0;
+        sttHdr->mss = 0;
     }
-    sttHdr->l4Offset = 0;
 
     sttHdr->reserved = 0;
-    /* XXX Used for large TCP packets.Not sure how it is used, clarify */
-    sttHdr->mss = 0;
     sttHdr->vlanTCI = 0;
     sttHdr->key = tunKey->tunnelId;
     /* Zero out stt padding */
     *(uint16 *)(sttHdr + 1) = 0;
 
     /* Offload IP and TCP checksum */
+    ULONG tcpHeaderOffset = sizeof *outerEthHdr +
+                        outerIpHdr->ihl * 4;
     csumInfo.Value = 0;
     csumInfo.Transmit.IpHeaderChecksum = 1;
     csumInfo.Transmit.TcpChecksum = 1;
     csumInfo.Transmit.IsIPv4 = 1;
-    csumInfo.Transmit.TcpHeaderOffset = sizeof *outerEthHdr +
-                                        outerIpHdr->ihl * 4;
+    csumInfo.Transmit.TcpHeaderOffset = tcpHeaderOffset;
     NET_BUFFER_LIST_INFO(curNbl,
                          TcpIpChecksumNetBufferListInfo) = csumInfo.Value;
 
+    UINT32 encapMss = OvsGetExternalMtu(switchContext) - sizeof(IPHdr) - sizeof(TCPHdr);
+    if (ipTotalLen > encapMss) {
+        lsoInfo.Value = 0;
+        lsoInfo.LsoV2Transmit.TcpHeaderOffset = tcpHeaderOffset;
+        lsoInfo.LsoV2Transmit.MSS = encapMss;
+        lsoInfo.LsoV2Transmit.Type = NDIS_TCP_LARGE_SEND_OFFLOAD_V2_TYPE;
+        lsoInfo.LsoV2Transmit.IPVersion = NDIS_TCP_LARGE_SEND_OFFLOAD_IPv4;
+        NET_BUFFER_LIST_INFO(curNbl,
+                             TcpLargeSendNetBufferListInfo) = lsoInfo.Value;
+    }
+
     return STATUS_SUCCESS;
 
 ret_error:
@@ -338,16 +361,22 @@  ret_error:
 
 /*
  *----------------------------------------------------------------------------
- * OvsCalculateTCPChecksum
- *     Calculate TCP checksum
+ * OvsValidateTCPChecksum
+ *     Validate TCP checksum
  *----------------------------------------------------------------------------
  */
 static __inline NDIS_STATUS
-OvsCalculateTCPChecksum(PNET_BUFFER_LIST curNbl, PNET_BUFFER curNb)
+OvsValidateTCPChecksum(PNET_BUFFER_LIST curNbl, PNET_BUFFER curNb)
 {
     NDIS_TCP_IP_CHECKSUM_NET_BUFFER_LIST_INFO csumInfo;
     csumInfo.Value = NET_BUFFER_LIST_INFO(curNbl,
                                           TcpIpChecksumNetBufferListInfo);
+
+    /* Check if NIC has indicated TCP checksum failure */
+    if (csumInfo.Receive.TcpChecksumFailed) {
+        return NDIS_STATUS_INVALID_PACKET;
+    }
+
     UINT16 checkSum;
 
     /* Check if TCP Checksum has been calculated by NIC */
@@ -399,10 +428,9 @@  OvsInitSttDefragmentation()
     NdisAllocateSpinLock(&OvsSttSpinLock);
 
     /* Init the Hash Buffer */
-    OvsSttPktFragHash = (PLIST_ENTRY) OvsAllocateMemoryWithTag(
-                                                sizeof(LIST_ENTRY)
-                                                * STT_HASH_TABLE_SIZE,
-                                                OVS_STT_POOL_TAG);
+    OvsSttPktFragHash = OvsAllocateMemoryWithTag(sizeof(LIST_ENTRY)
+                                                 * STT_HASH_TABLE_SIZE,
+                                                 OVS_STT_POOL_TAG);
     if (OvsSttPktFragHash == NULL) {
         NdisFreeSpinLock(&OvsSttSpinLock);
         return STATUS_INSUFFICIENT_RESOURCES;
@@ -487,6 +515,7 @@  OvsSttDefragCleaner(PVOID data)
                 entry = CONTAINING_RECORD(link, OVS_STT_PKT_ENTRY, link);
                 if (entry->timeout < currentTime) {
                     RemoveEntryList(&entry->link);
+                    OvsFreeMemoryWithTag(entry->packetBuf, OVS_STT_POOL_TAG);
                     OvsFreeMemoryWithTag(entry, OVS_STT_POOL_TAG);
                 }
             }
@@ -500,6 +529,243 @@  OvsSttDefragCleaner(PVOID data)
     PsTerminateSystemThread(STATUS_SUCCESS);
 }
 
+/*
+ * --------------------------------------------------------------------------
+ * OvsGeneratePacketKey --
+ *     Build the reassembly lookup key for a fragment from the outer IP
+ *     source and destination addresses (copied as stored in the header)
+ *     and the outer TCP ack_seq field converted to host byte order.
+ * --------------------------------------------------------------------------
+ */
+static OVS_STT_PKT_KEY
+OvsGeneratePacketKey(IPHdr *ipHdr, TCPHdr *tcpHdr)
+{
+    OVS_STT_PKT_KEY key;
+    key.sAddr = ipHdr->saddr;
+    key.dAddr = ipHdr->daddr;
+    key.ackSeq = ntohl(tcpHdr->ack_seq);
+    return key;
+}
+
+/*
+ * --------------------------------------------------------------------------
+ * OvsSttGetPktHash --
+ *     Hash the three key fields with OvsJhashWords() seeded by
+ *     OVS_HASH_BASIS. Callers mask the result with STT_HASH_TABLE_MASK to
+ *     select a bucket.
+ * --------------------------------------------------------------------------
+ */
+static UINT32
+OvsSttGetPktHash(OVS_STT_PKT_KEY *pktKey)
+{
+    UINT32 arr[3];
+    arr[0] = pktKey->ackSeq;
+    arr[1] = pktKey->dAddr;
+    arr[2] = pktKey->sAddr;
+    return OvsJhashWords(arr, 3, OVS_HASH_BASIS);
+}
+
+/*
+ * --------------------------------------------------------------------------
+ * OvsLookupPktFrag --
+ *     Walk the bucket selected by 'hash' and return the OVS_STT_PKT_ENTRY
+ *     whose key matches 'pktKey' field by field, or NULL if there is none.
+ *     The table is not locked here; OvsSttReassemble() calls this while
+ *     holding OvsSttSpinLock.
+ * --------------------------------------------------------------------------
+ */
+static VOID *
+OvsLookupPktFrag(OVS_STT_PKT_KEY *pktKey, UINT32 hash)
+{
+    PLIST_ENTRY link;
+    POVS_STT_PKT_ENTRY entry;
+
+    LIST_FORALL(&OvsSttPktFragHash[hash & STT_HASH_TABLE_MASK], link) {
+        entry = CONTAINING_RECORD(link, OVS_STT_PKT_ENTRY, link);
+        if (entry->ovsPktKey.ackSeq == pktKey->ackSeq &&
+            entry->ovsPktKey.dAddr == pktKey->dAddr &&
+            entry->ovsPktKey.sAddr == pktKey->sAddr) {
+            return entry;
+        }
+    }
+    return NULL;
+}
+
+/*
+ * --------------------------------------------------------------------------
+ * OvsSttReassemble --
+ *     Reassemble an LSO packet from multiple STT-Fragments.
+ *
+ *     The outer TCP sequence number carries the total STT frame length in
+ *     its upper bits (seq >> STT_SEQ_LEN_SHIFT) and this fragment's offset
+ *     in STT_SEGMENT_OFF(seq). Fragments are keyed on the outer IP
+ *     addresses and TCP ack_seq and copied into a per-packet buffer until
+ *     the received length equals the inner packet length.
+ *
+ *     'payloadLen' is computed by the caller as the outer IP total length
+ *     minus the IP and TCP header sizes; the caller has already advanced
+ *     the NET_BUFFER past those headers.
+ *
+ *     Returns the reassembled NBL and fills in 'newSttHdr' once the last
+ *     fragment has arrived. Returns NULL when more fragments are pending,
+ *     when the fragment is malformed, or when an allocation or copy fails.
+ * --------------------------------------------------------------------------
+ */
+PNET_BUFFER_LIST
+OvsSttReassemble(POVS_SWITCH_CONTEXT switchContext,
+                 PNET_BUFFER_LIST curNbl,
+                 IPHdr *ipHdr,
+                 TCPHdr *tcp,
+                 SttHdr *newSttHdr,
+                 UINT16 payloadLen)
+{
+    UINT32 seq = ntohl(tcp->seq);
+    UINT32 totalLen = seq >> STT_SEQ_LEN_SHIFT;
+    UINT32 segOffset = STT_SEGMENT_OFF(seq);
+    UINT32 innerPacketLen;
+    UINT32 offset;
+    UINT32 startOffset = 0;
+    UINT32 fragmentLength = payloadLen;
+    UINT32 hash;
+    OVS_STT_PKT_KEY pktKey;
+    POVS_STT_PKT_ENTRY pktFragEntry;
+    PNET_BUFFER_LIST targetPNbl = NULL;
+    PNET_BUFFER sourceNb = NET_BUFFER_LIST_FIRST_NB(curNbl);
+    SttHdr stt;
+    SttHdr *sttHdr = NULL;
+
+    /*
+     * All lengths and offsets below come from the wire. Validate them
+     * before taking the lock so that a malformed fragment can neither
+     * underflow the unsigned arithmetic nor write outside the reassembly
+     * buffer.
+     */
+    if (totalLen <= STT_HDR_LEN) {
+        OVS_LOG_ERROR("Invalid STT frame length in TCP sequence number");
+        return NULL;
+    }
+    innerPacketLen = totalLen - STT_HDR_LEN;
+
+    if (segOffset == 0) {
+        /* The first fragment starts with the STT header; keep a copy. */
+        if (fragmentLength < STT_HDR_LEN) {
+            OVS_LOG_ERROR("First STT fragment shorter than STT header");
+            return NULL;
+        }
+        sttHdr = NdisGetDataBuffer(sourceNb, sizeof(SttHdr), &stt, 1, 0);
+        if (sttHdr == NULL) {
+            OVS_LOG_ERROR("Unable to retrieve STT header");
+            return NULL;
+        }
+        fragmentLength = fragmentLength - STT_HDR_LEN;
+        startOffset = startOffset + STT_HDR_LEN;
+        offset = 0;
+    } else {
+        if (segOffset < STT_HDR_LEN) {
+            OVS_LOG_ERROR("STT fragment offset falls inside the STT header");
+            return NULL;
+        }
+        offset = segOffset - STT_HDR_LEN;
+    }
+
+    if (offset > innerPacketLen ||
+        fragmentLength > innerPacketLen - offset) {
+        OVS_LOG_ERROR("STT fragment exceeds the advertised packet length");
+        return NULL;
+    }
+
+    /* XXX optimize this lock */
+    NdisAcquireSpinLock(&OvsSttSpinLock);
+
+    /* Lookup fragment */
+    pktKey = OvsGeneratePacketKey(ipHdr, tcp);
+    hash = OvsSttGetPktHash(&pktKey);
+    pktFragEntry = OvsLookupPktFrag(&pktKey, hash);
+
+    if (pktFragEntry == NULL) {
+        /* First fragment seen for this key: create a new Packet Entry */
+        POVS_STT_PKT_ENTRY entry;
+        UINT64 currentTime;
+
+        entry = OvsAllocateMemoryWithTag(sizeof(OVS_STT_PKT_ENTRY),
+                                         OVS_STT_POOL_TAG);
+        if (entry == NULL) {
+            OVS_LOG_ERROR("Unable to allocate STT reassembly entry");
+            goto unlock;
+        }
+        RtlZeroMemory(entry, sizeof (OVS_STT_PKT_ENTRY));
+
+        entry->packetBuf = OvsAllocateMemoryWithTag(innerPacketLen,
+                                                    OVS_STT_POOL_TAG);
+        if (entry->packetBuf == NULL) {
+            OVS_LOG_ERROR("Unable to allocate STT reassembly buffer");
+            OvsFreeMemoryWithTag(entry, OVS_STT_POOL_TAG);
+            goto unlock;
+        }
+
+        /* Copy the data from Source to new buffer */
+        if (OvsGetPacketBytes(curNbl, fragmentLength, startOffset,
+                              entry->packetBuf + offset) == NULL) {
+            OVS_LOG_ERROR("Error when obtaining bytes from Packet");
+            /* The entry was never linked into the table; free it here. */
+            OvsFreeMemoryWithTag(entry->packetBuf, OVS_STT_POOL_TAG);
+            OvsFreeMemoryWithTag(entry, OVS_STT_POOL_TAG);
+            goto unlock;
+        }
+
+        /* Update Key, timestamp and recvdLen */
+        NdisMoveMemory(&entry->ovsPktKey, &pktKey, sizeof (OVS_STT_PKT_KEY));
+        entry->recvdLen = fragmentLength;
+        NdisGetCurrentSystemTime((LARGE_INTEGER *) &currentTime);
+        entry->timeout = currentTime + STT_ENTRY_TIMEOUT;
+        if (segOffset == 0) {
+            entry->sttHdr = *sttHdr;
+        }
+
+        /* Insert the entry in the Static Buffer */
+        InsertHeadList(&OvsSttPktFragHash[hash & STT_HASH_TABLE_MASK],
+                       &entry->link);
+        goto unlock;
+    }
+
+    /*
+     * NOTE(review): the buffer was sized from the length advertised by the
+     * first fragment seen and the entry does not record that size, so a
+     * later fragment that reuses the key with a larger advertised length
+     * is not caught by the bounds check above. Recording the allocated
+     * length in OVS_STT_PKT_ENTRY would close this gap.
+     */
+    /* Copy the fragment data from Source to existing buffer */
+    if (OvsGetPacketBytes(curNbl, fragmentLength, startOffset,
+                          pktFragEntry->packetBuf + offset) == NULL) {
+        OVS_LOG_ERROR("Error when obtaining bytes from Packet");
+        goto unlock;
+    }
+
+    if (segOffset == 0) {
+        pktFragEntry->sttHdr = *sttHdr;
+    }
+
+    /* Count only copied bytes; the packet is complete when all arrived. */
+    pktFragEntry->recvdLen += fragmentLength;
+    if (pktFragEntry->recvdLen == innerPacketLen) {
+        /* Retrieve the original STT header */
+        NdisMoveMemory(newSttHdr, &pktFragEntry->sttHdr, sizeof (SttHdr));
+        targetPNbl = OvsAllocateNBLFromBuffer(switchContext,
+                                              pktFragEntry->packetBuf,
+                                              innerPacketLen);
+
+        /* Delete this entry and free up the memory */
+        RemoveEntryList(&pktFragEntry->link);
+        OvsFreeMemoryWithTag(pktFragEntry->packetBuf, OVS_STT_POOL_TAG);
+        OvsFreeMemoryWithTag(pktFragEntry, OVS_STT_POOL_TAG);
+    }
+
+unlock:
+    NdisReleaseSpinLock(&OvsSttSpinLock);
+    return targetPNbl;
+}
+
 /*
  * --------------------------------------------------------------------------
  * OvsDecapStt --
@@ -513,34 +694,20 @@  OvsDecapStt(POVS_SWITCH_CONTEXT switchContext,
             PNET_BUFFER_LIST *newNbl)
 {
     NDIS_STATUS status = NDIS_STATUS_FAILURE;
-    PNET_BUFFER curNb;
+    PNET_BUFFER curNb, newNb;
     IPHdr *ipHdr;
     char *ipBuf[sizeof(IPHdr)];
+    SttHdr stt;
     SttHdr *sttHdr;
     char *sttBuf[STT_HDR_LEN];
     UINT32 advanceCnt, hdrLen;
-    NDIS_TCP_IP_CHECKSUM_NET_BUFFER_LIST_INFO csumInfo;
+    BOOLEAN isLsoPacket = FALSE;
 
     curNb = NET_BUFFER_LIST_FIRST_NB(curNbl);
     ASSERT(NET_BUFFER_NEXT_NB(curNb) == NULL);
 
-    if (NET_BUFFER_DATA_LENGTH(curNb) < OvsGetSttTunHdrSize()) {
-        OVS_LOG_ERROR("Packet length received is less than the tunnel header:"
-            " %d<%d\n", NET_BUFFER_DATA_LENGTH(curNb), OvsGetSttTunHdrSize());
-        return NDIS_STATUS_INVALID_LENGTH;
-    }
-
-    /* Verify outer TCP Checksum */
-    csumInfo.Value = NET_BUFFER_LIST_INFO(curNbl,
-                                          TcpIpChecksumNetBufferListInfo);
-
-    /* Check if NIC has indicated TCP checksum failure */
-    if (csumInfo.Receive.TcpChecksumFailed) {
-        return NDIS_STATUS_INVALID_PACKET;
-    }
-
-    /* Calculate the TCP Checksum */
-    status = OvsCalculateTCPChecksum(curNbl, curNb);
+    /* Validate the TCP Checksum */
+    status = OvsValidateTCPChecksum(curNbl, curNb);
     if (status != NDIS_STATUS_SUCCESS) {
         return status;
     }
@@ -554,34 +721,73 @@  OvsDecapStt(POVS_SWITCH_CONTEXT switchContext,
                                                     1 /*no align*/, 0);
     ASSERT(ipHdr);
 
+    TCPHdr *tcp = (TCPHdr *)((PCHAR)ipHdr + ipHdr->ihl * 4);
+
     /* Skip IP & TCP headers */
     hdrLen = sizeof(IPHdr) + sizeof(TCPHdr),
     NdisAdvanceNetBufferDataStart(curNb, hdrLen, FALSE, NULL);
     advanceCnt += hdrLen;
 
-    /* STT Header */
-    sttHdr = NdisGetDataBuffer(curNb, sizeof *sttHdr, (PVOID) &sttBuf,
-                                                    1 /*no align*/, 0);
+    UINT32 seq = ntohl(tcp->seq);
+    UINT32 totalLen = (seq >> STT_SEQ_LEN_SHIFT);
+    UINT16 payloadLen = (UINT16)ntohs(ipHdr->tot_len)
+                        - (ipHdr->ihl * 4)
+                        - (sizeof * tcp);
+
+    /* Check if incoming packet requires reassembly */
+    if (totalLen != payloadLen) {
+        sttHdr = &stt;
+        PNET_BUFFER_LIST pNbl = OvsSttReassemble(switchContext, curNbl,
+                                                 ipHdr, tcp, sttHdr,
+                                                 payloadLen);
+        if (pNbl == NULL) {
+            return NDIS_STATUS_SUCCESS;
+        }
+
+        *newNbl = pNbl;
+        isLsoPacket = TRUE;
+    } else {
+        /* STT Header */
+        sttHdr = NdisGetDataBuffer(curNb, sizeof *sttHdr,
+                                   (PVOID) &sttBuf, 1 /*no align*/, 0);
+        /* Skip stt header, DataOffset points to inner pkt now. */
+        hdrLen = STT_HDR_LEN;
+        NdisAdvanceNetBufferDataStart(curNb, hdrLen, FALSE, NULL);
+        advanceCnt += hdrLen;
+
+        *newNbl = OvsPartialCopyNBL(switchContext, curNbl, 0,
+                                    0, FALSE /*copy NBL info*/);
+    }
+
+    if (*newNbl == NULL) {
+        OVS_LOG_ERROR("Unable to allocate a new cloned NBL");
+        return NDIS_STATUS_RESOURCES;
+    }
+
+    status = NdisRetreatNetBufferDataStart(curNb, advanceCnt, 0, NULL);
+    if (status != NDIS_STATUS_SUCCESS) {
+        OvsCompleteNBL(switchContext, *newNbl, TRUE);
+        return NDIS_STATUS_FAILURE;
+    }
+    newNb = NET_BUFFER_LIST_FIRST_NB(*newNbl);
+
     ASSERT(sttHdr);
 
     /* Initialize the tunnel key */
     tunKey->dst = ipHdr->daddr;
     tunKey->src = ipHdr->saddr;
     tunKey->tunnelId = sttHdr->key;
-    tunKey->flags = (OVS_TNL_F_CSUM | OVS_TNL_F_KEY);
+    tunKey->flags = OVS_TNL_F_KEY;
     tunKey->tos = ipHdr->tos;
     tunKey->ttl = ipHdr->ttl;
     tunKey->pad = 0;
 
-    /* Skip stt header, DataOffset points to inner pkt now. */
-    hdrLen = STT_HDR_LEN;
-    NdisAdvanceNetBufferDataStart(curNb, hdrLen, FALSE, NULL);
-    advanceCnt += hdrLen;
+    BOOLEAN requiresLSO = sttHdr->mss != 0;
 
     /* Verify checksum for inner packet if it's required */
     if (!(sttHdr->flags & STT_CSUM_VERIFIED)) {
         BOOLEAN innerChecksumPartial = sttHdr->flags & STT_CSUM_PARTIAL;
-        EthHdr *eth = (EthHdr *)NdisGetDataBuffer(curNb, sizeof(EthHdr),
+        EthHdr *eth = (EthHdr *)NdisGetDataBuffer(newNb, sizeof(EthHdr),
                                                   NULL, 1, 0);
 
         /* XXX Figure out a way to offload checksum receives */
@@ -597,14 +803,16 @@  OvsDecapStt(POVS_SWITCH_CONTEXT switchContext,
                                                   IPPROTO_TCP,
                                                   (UINT16)l4Payload);
                 }
-                tcp->check = CalculateChecksumNB(curNb, l4Payload, offset);
+                if (!requiresLSO) {
+                    tcp->check = CalculateChecksumNB(newNb, l4Payload, offset);
+                }
             } else if (ip->protocol == IPPROTO_UDP) {
                 UDPHdr *udp = (UDPHdr *)((PCHAR)ip + sizeof *ip);
                 if (!innerChecksumPartial){
                     udp->check = IPPseudoChecksum(&ip->saddr, &ip->daddr,
                                                   IPPROTO_UDP, l4Payload);
                 }
-                udp->check = CalculateChecksumNB(curNb, l4Payload, offset);
+                udp->check = CalculateChecksumNB(newNb, l4Payload, offset);
             }
         } else if (eth->Type == ntohs(NDIS_ETH_TYPE_IPV6)) {
             IPv6Hdr *ip = (IPv6Hdr *)((PCHAR)eth + sizeof *eth);
@@ -617,7 +825,9 @@  OvsDecapStt(POVS_SWITCH_CONTEXT switchContext,
                                                     (UINT32 *)&ip->daddr,
                                                     IPPROTO_TCP, totalLength);
                 }
-                tcp->check = CalculateChecksumNB(curNb, totalLength, offset);
+                if (!requiresLSO) {
+                    tcp->check = CalculateChecksumNB(newNb, totalLength, offset);
+                }
             }
             else if (ip->nexthdr == IPPROTO_UDP) {
                 UDPHdr *udp = (UDPHdr *)((PCHAR)ip + sizeof *ip);
@@ -626,23 +836,27 @@  OvsDecapStt(POVS_SWITCH_CONTEXT switchContext,
                                                     (UINT32 *)&ip->daddr,
                                                     IPPROTO_UDP, totalLength);
                 }
-                udp->check = CalculateChecksumNB(curNb, totalLength, offset);
+                udp->check = CalculateChecksumNB(newNb, totalLength, offset);
             }
         }
 
-        NET_BUFFER_LIST_INFO(curNbl, TcpIpChecksumNetBufferListInfo) = 0;
+        NET_BUFFER_LIST_INFO(*newNbl, TcpIpChecksumNetBufferListInfo) = 0;
     }
 
-    *newNbl = OvsPartialCopyNBL(switchContext, curNbl, OVS_DEFAULT_COPY_SIZE,
-                                0, FALSE /*copy NBL info*/);
-
-    ASSERT(advanceCnt == OvsGetSttTunHdrSize());
-    status = NdisRetreatNetBufferDataStart(curNb, advanceCnt, 0, NULL);
-
-    if (*newNbl == NULL) {
-        OVS_LOG_ERROR("OvsDecapStt: Unable to allocate a new cloned NBL");
-        status = NDIS_STATUS_RESOURCES;
+    if (requiresLSO) {
+        NDIS_TCP_LARGE_SEND_OFFLOAD_NET_BUFFER_LIST_INFO lsoInfo;
+        lsoInfo.Value = 0;
+        lsoInfo.LsoV2Transmit.TcpHeaderOffset = sttHdr->l4Offset;
+        lsoInfo.LsoV2Transmit.MSS = ETH_DEFAULT_MTU - sizeof(IPHdr) - sizeof(TCPHdr);
+        lsoInfo.LsoV2Transmit.Type = NDIS_TCP_LARGE_SEND_OFFLOAD_V2_TYPE;
+        if (sttHdr->flags & STT_PROTO_IPV4) {
+            lsoInfo.LsoV2Transmit.IPVersion = NDIS_TCP_LARGE_SEND_OFFLOAD_IPv4;
+        } else {
+            lsoInfo.LsoV2Transmit.IPVersion = NDIS_TCP_LARGE_SEND_OFFLOAD_IPv6;
+        }
+        NET_BUFFER_LIST_INFO(*newNbl,
+                                TcpLargeSendNetBufferListInfo) = lsoInfo.Value;
     }
 
-    return status;
+    return NDIS_STATUS_SUCCESS;
 }