Message ID | 1445903321-7864-4-git-send-email-vsairam@vmware.com |
---|---|
State | Accepted |
Headers | show |
Thanks for working on this. It is really awesome to have TSO support with STT now! Acked-by: Nithin Raju <nithin@vmware.com> -----Original Message----- From: Sairam Venugopal <vsairam@vmware.com> Date: Tuesday, October 27, 2015 at 10:20 AM To: Nithin Raju <nithin@vmware.com> Subject: Fw: [PATCH v2 3/3] datapath-windows: STT - Enable support for TCP Segmentation offloads > >________________________________________ >From: Sairam Venugopal <vsairam@vmware.com> >Sent: Monday, October 26, 2015 4:48 PM >To: dev@openvswitch.org >Cc: Sairam Venugopal >Subject: [PATCH v2 3/3] datapath-windows: STT - Enable support for TCP >Segmentation offloads > >Add support to STT - Encap and Decap functions to reassemble the packet >fragments. Also add support to offload the packet to NDIS. > >Signed-off-by: Sairam Venugopal <vsairam@vmware.com> >--- > datapath-windows/ovsext/Actions.c | 40 ++-- > datapath-windows/ovsext/Stt.c | 398 >+++++++++++++++++++++++++++++--------- > 2 files changed, 329 insertions(+), 109 deletions(-) > >diff --git a/datapath-windows/ovsext/Actions.c >b/datapath-windows/ovsext/Actions.c >index b4644a7..ce592b3 100644 >--- a/datapath-windows/ovsext/Actions.c >+++ b/datapath-windows/ovsext/Actions.c >@@ -594,7 +594,7 @@ OvsDoFlowLookupOutput(OvsForwardingContext *ovsFwdCtx) > InitializeListHead(&missedPackets); > status = OvsCreateAndAddPackets(NULL, 0, OVS_PACKET_CMD_MISS, >vport, > &key,ovsFwdCtx->curNbl, >- ovsFwdCtx->tunnelRxNic != NULL, >&ovsFwdCtx->layers, >+ FALSE, &ovsFwdCtx->layers, > ovsFwdCtx->switchContext, &missedPackets, >&num); > if (num) { > OvsQueuePackets(&missedPackets, num); >@@ -709,6 +709,7 @@ OvsTunnelPortRx(OvsForwardingContext *ovsFwdCtx) > NDIS_STATUS status = NDIS_STATUS_SUCCESS; > PNET_BUFFER_LIST newNbl = NULL; > POVS_VPORT_ENTRY tunnelRxVport = ovsFwdCtx->tunnelRxNic; >+ PCWSTR dropReason = L"OVS-dropped due to new decap packet"; > > if (OvsValidateIPChecksum(ovsFwdCtx->curNbl, &ovsFwdCtx->layers) > != NDIS_STATUS_SUCCESS) { >@@ -730,6 
+731,10 @@ OvsTunnelPortRx(OvsForwardingContext *ovsFwdCtx) > case OVS_VPORT_TYPE_STT: > status = OvsDecapStt(ovsFwdCtx->switchContext, ovsFwdCtx->curNbl, > &ovsFwdCtx->tunKey, &newNbl); >+ if (status == NDIS_STATUS_SUCCESS && newNbl == NULL) { >+ /* This was an STT-LSO Fragment */ >+ dropReason = L"OVS-STT segment is cached"; >+ } > break; > default: > OVS_LOG_ERROR("Rx: Unhandled tunnel type: %d\n", >@@ -747,25 +752,26 @@ OvsTunnelPortRx(OvsForwardingContext *ovsFwdCtx) > * tunnelRxNic and other fields will be cleared, re-init the context > * before usage. > */ >- OvsCompleteNBLForwardingCtx(ovsFwdCtx, >- L"OVS-dropped due to new decap packet"); >+ OvsCompleteNBLForwardingCtx(ovsFwdCtx, dropReason); > >- /* Decapsulated packet is in a new NBL */ >- ovsFwdCtx->tunnelRxNic = tunnelRxVport; >- OvsInitForwardingCtx(ovsFwdCtx, ovsFwdCtx->switchContext, >- newNbl, tunnelRxVport->portNo, 0, >- >NET_BUFFER_LIST_SWITCH_FORWARDING_DETAIL(newNbl), >- ovsFwdCtx->completionList, >- &ovsFwdCtx->layers, FALSE); >+ if (newNbl) { >+ /* Decapsulated packet is in a new NBL */ >+ ovsFwdCtx->tunnelRxNic = tunnelRxVport; >+ OvsInitForwardingCtx(ovsFwdCtx, ovsFwdCtx->switchContext, >+ newNbl, tunnelRxVport->portNo, 0, >+ >NET_BUFFER_LIST_SWITCH_FORWARDING_DETAIL(newNbl), >+ ovsFwdCtx->completionList, >+ &ovsFwdCtx->layers, FALSE); > >- /* >- * Set the NBL's SourcePortId and SourceNicIndex to default values to >- * keep NDIS happy when we forward the packet. >- */ >- ovsFwdCtx->fwdDetail->SourcePortId = NDIS_SWITCH_DEFAULT_PORT_ID; >- ovsFwdCtx->fwdDetail->SourceNicIndex = 0; >+ /* >+ * Set the NBL's SourcePortId and SourceNicIndex to default >values to >+ * keep NDIS happy when we forward the packet. 
>+ */ >+ ovsFwdCtx->fwdDetail->SourcePortId = NDIS_SWITCH_DEFAULT_PORT_ID; >+ ovsFwdCtx->fwdDetail->SourceNicIndex = 0; > >- status = OvsDoFlowLookupOutput(ovsFwdCtx); >+ status = OvsDoFlowLookupOutput(ovsFwdCtx); >+ } > ASSERT(ovsFwdCtx->curNbl == NULL); > OvsClearTunRxCtx(ovsFwdCtx); > >diff --git a/datapath-windows/ovsext/Stt.c b/datapath-windows/ovsext/Stt.c >index b78ef95..ef44d23 100644 >--- a/datapath-windows/ovsext/Stt.c >+++ b/datapath-windows/ovsext/Stt.c >@@ -34,6 +34,7 @@ > #endif > #define OVS_DBG_MOD OVS_DBG_STT > #include "Debug.h" >+#include "Jhash.h" > > KSTART_ROUTINE OvsSttDefragCleaner; > static PLIST_ENTRY OvsSttPktFragHash; >@@ -152,8 +153,8 @@ OvsDoEncapStt(POVS_VPORT_ENTRY vport, > UINT32 headRoom = OvsGetSttTunHdrSize(); > UINT32 tcpChksumLen; > PUINT8 bufferStart; >- >- UNREFERENCED_PARAMETER(layers); >+ ULONG mss = 0; >+ NDIS_TCP_LARGE_SEND_OFFLOAD_NET_BUFFER_LIST_INFO lsoInfo; > > curNb = NET_BUFFER_LIST_FIRST_NB(curNbl); > >@@ -162,14 +163,20 @@ OvsDoEncapStt(POVS_VPORT_ENTRY vport, > BOOLEAN innerPartialChecksum = FALSE; > > if (layers->isTcp) { >- NDIS_TCP_LARGE_SEND_OFFLOAD_NET_BUFFER_LIST_INFO lsoInfo; >- > lsoInfo.Value = NET_BUFFER_LIST_INFO(curNbl, > TcpLargeSendNetBufferListInfo); >- if (lsoInfo.LsoV1Transmit.MSS) { >- /* XXX We don't handle LSO yet */ >- OVS_LOG_ERROR("LSO on STT is not supported"); >- return NDIS_STATUS_FAILURE; >+ >+ switch (lsoInfo.Transmit.Type) { >+ case NDIS_TCP_LARGE_SEND_OFFLOAD_V1_TYPE: >+ mss = lsoInfo.LsoV1Transmit.MSS; >+ break; >+ case NDIS_TCP_LARGE_SEND_OFFLOAD_V2_TYPE: >+ mss = lsoInfo.LsoV2Transmit.MSS; >+ break; >+ default: >+ OVS_LOG_ERROR("Unknown LSO transmit type:%d", >+ lsoInfo.Transmit.Type); >+ return NDIS_STATUS_FAILURE; > } > } > >@@ -186,21 +193,36 @@ OvsDoEncapStt(POVS_VPORT_ENTRY vport, > return NDIS_STATUS_FAILURE; > } > >- curNb = NET_BUFFER_LIST_FIRST_NB(*newNbl); >+ curNbl = *newNbl; >+ curNb = NET_BUFFER_LIST_FIRST_NB(curNbl); > curMdl = NET_BUFFER_CURRENT_MDL(curNb); >+ /* NB 
Chain should be split before */ >+ ASSERT(NET_BUFFER_NEXT_NB(curNb) == NULL); >+ innerFrameLen = NET_BUFFER_DATA_LENGTH(curNb); >+ > bufferStart = (PUINT8)MmGetSystemAddressForMdlSafe(curMdl, > LowPagePriority); > bufferStart += NET_BUFFER_CURRENT_MDL_OFFSET(curNb); > >- if (layers->isIPv4 && csumInfo.Transmit.IpHeaderChecksum) { >+ if (layers->isIPv4) { > IPHdr *ip = (IPHdr *)(bufferStart + layers->l3Offset); >- ip->check = IPChecksum((UINT8 *)ip, ip->ihl * 4, 0); >+ if (!ip->tot_len) { >+ ip->tot_len = htons(innerFrameLen - sizeof(EthHdr)); >+ } >+ if (!ip->check) { >+ ip->check = IPChecksum((UINT8 *)ip, ip->ihl * 4, 0); >+ } > } >+ > if (layers->isTcp) { >- if(!csumInfo.Transmit.TcpChecksum) { >- innerChecksumVerified = TRUE; >- } else { >+ if (mss) { > innerPartialChecksum = TRUE; >+ } else { >+ if (!csumInfo.Transmit.TcpChecksum) { >+ innerChecksumVerified = TRUE; >+ } else { >+ innerPartialChecksum = TRUE; >+ } > } > } else if (layers->isUdp) { > if(!csumInfo.Transmit.UdpChecksum) { >@@ -210,24 +232,6 @@ OvsDoEncapStt(POVS_VPORT_ENTRY vport, > } > } > >- curNbl = *newNbl; >- curNb = NET_BUFFER_LIST_FIRST_NB(curNbl); >- /* NB Chain should be split before */ >- ASSERT(NET_BUFFER_NEXT_NB(curNb) == NULL); >- >- innerFrameLen = NET_BUFFER_DATA_LENGTH(curNb); >- /* >- * External port can't be removed as we hold the dispatch lock >- * We also check if the external port was removed beforecalling >- * port encapsulation functions >- */ >- if (innerFrameLen > OvsGetExternalMtu(switchContext) - headRoom) { >- OVS_LOG_ERROR("Packet too large (size %d, mtu %d). 
Can't >encapsulate", >- innerFrameLen, OvsGetExternalMtu(switchContext)); >- status = NDIS_STATUS_FAILURE; >- goto ret_error; >- } >- > status = NdisRetreatNetBufferDataStart(curNb, headRoom, 0, NULL); > if (status != NDIS_STATUS_SUCCESS) { > ASSERT(!"Unable to NdisRetreatNetBufferDataStart(headroom)"); >@@ -301,33 +305,52 @@ OvsDoEncapStt(POVS_VPORT_ENTRY vport, > IPPROTO_TCP, (uint16) >tcpChksumLen); > sttHdr->version = 0; > >- /* XXX need to peek into the inner packet, hard code for now */ >- sttHdr->flags = STT_PROTO_IPV4; >- if (innerChecksumVerified) { >- sttHdr->flags |= STT_CSUM_VERIFIED; >- } else if (innerPartialChecksum) { >+ /* Set STT Header */ >+ sttHdr->flags = 0; >+ if (innerPartialChecksum) { > sttHdr->flags |= STT_CSUM_PARTIAL; >+ if (layers->isIPv4) { >+ sttHdr->flags |= STT_PROTO_IPV4; >+ } >+ if (layers->isTcp) { >+ sttHdr->flags |= STT_PROTO_TCP; >+ } >+ sttHdr->l4Offset = (UINT8) layers->l4Offset; >+ sttHdr->mss = (UINT16) htons(mss); >+ } else if (innerChecksumVerified) { >+ sttHdr->flags = STT_CSUM_VERIFIED; >+ sttHdr->l4Offset = 0; >+ sttHdr->mss = 0; > } >- sttHdr->l4Offset = 0; > > sttHdr->reserved = 0; >- /* XXX Used for large TCP packets.Not sure how it is used, clarify */ >- sttHdr->mss = 0; > sttHdr->vlanTCI = 0; > sttHdr->key = tunKey->tunnelId; > /* Zero out stt padding */ > *(uint16 *)(sttHdr + 1) = 0; > > /* Offload IP and TCP checksum */ >+ ULONG tcpHeaderOffset = sizeof *outerEthHdr + >+ outerIpHdr->ihl * 4; > csumInfo.Value = 0; > csumInfo.Transmit.IpHeaderChecksum = 1; > csumInfo.Transmit.TcpChecksum = 1; > csumInfo.Transmit.IsIPv4 = 1; >- csumInfo.Transmit.TcpHeaderOffset = sizeof *outerEthHdr + >- outerIpHdr->ihl * 4; >+ csumInfo.Transmit.TcpHeaderOffset = tcpHeaderOffset; > NET_BUFFER_LIST_INFO(curNbl, > TcpIpChecksumNetBufferListInfo) = >csumInfo.Value; > >+ UINT32 encapMss = OvsGetExternalMtu(switchContext) - sizeof(IPHdr) - >sizeof(TCPHdr); >+ if (ipTotalLen > encapMss) { >+ lsoInfo.Value = 0; >+ 
lsoInfo.LsoV2Transmit.TcpHeaderOffset = tcpHeaderOffset; >+ lsoInfo.LsoV2Transmit.MSS = encapMss; >+ lsoInfo.LsoV2Transmit.Type = NDIS_TCP_LARGE_SEND_OFFLOAD_V2_TYPE; >+ lsoInfo.LsoV2Transmit.IPVersion = >NDIS_TCP_LARGE_SEND_OFFLOAD_IPv4; >+ NET_BUFFER_LIST_INFO(curNbl, >+ TcpLargeSendNetBufferListInfo) = >lsoInfo.Value; >+ } >+ > return STATUS_SUCCESS; > > ret_error: >@@ -338,16 +361,22 @@ ret_error: > > /* > >*------------------------------------------------------------------------- >--- >- * OvsCalculateTCPChecksum >- * Calculate TCP checksum >+ * OvsValidateTCPChecksum >+ * Validate TCP checksum > >*------------------------------------------------------------------------- >--- > */ > static __inline NDIS_STATUS >-OvsCalculateTCPChecksum(PNET_BUFFER_LIST curNbl, PNET_BUFFER curNb) >+OvsValidateTCPChecksum(PNET_BUFFER_LIST curNbl, PNET_BUFFER curNb) > { > NDIS_TCP_IP_CHECKSUM_NET_BUFFER_LIST_INFO csumInfo; > csumInfo.Value = NET_BUFFER_LIST_INFO(curNbl, > >TcpIpChecksumNetBufferListInfo); >+ >+ /* Check if NIC has indicated TCP checksum failure */ >+ if (csumInfo.Receive.TcpChecksumFailed) { >+ return NDIS_STATUS_INVALID_PACKET; >+ } >+ > UINT16 checkSum; > > /* Check if TCP Checksum has been calculated by NIC */ >@@ -399,10 +428,9 @@ OvsInitSttDefragmentation() > NdisAllocateSpinLock(&OvsSttSpinLock); > > /* Init the Hash Buffer */ >- OvsSttPktFragHash = (PLIST_ENTRY) OvsAllocateMemoryWithTag( >- sizeof(LIST_ENTRY) >- * STT_HASH_TABLE_SIZE, >- OVS_STT_POOL_TAG); >+ OvsSttPktFragHash = OvsAllocateMemoryWithTag(sizeof(LIST_ENTRY) >+ * STT_HASH_TABLE_SIZE, >+ OVS_STT_POOL_TAG); > if (OvsSttPktFragHash == NULL) { > NdisFreeSpinLock(&OvsSttSpinLock); > return STATUS_INSUFFICIENT_RESOURCES; >@@ -487,6 +515,7 @@ OvsSttDefragCleaner(PVOID data) > entry = CONTAINING_RECORD(link, OVS_STT_PKT_ENTRY, link); > if (entry->timeout < currentTime) { > RemoveEntryList(&entry->link); >+ OvsFreeMemoryWithTag(entry->packetBuf, >OVS_STT_POOL_TAG); > OvsFreeMemoryWithTag(entry, 
OVS_STT_POOL_TAG); > } > } >@@ -500,6 +529,158 @@ OvsSttDefragCleaner(PVOID data) > PsTerminateSystemThread(STATUS_SUCCESS); > } > >+static OVS_STT_PKT_KEY >+OvsGeneratePacketKey(IPHdr *ipHdr, TCPHdr *tcpHdr) >+{ >+ OVS_STT_PKT_KEY key; >+ key.sAddr = ipHdr->saddr; >+ key.dAddr = ipHdr->daddr; >+ key.ackSeq = ntohl(tcpHdr->ack_seq); >+ return key; >+} >+ >+static UINT32 >+OvsSttGetPktHash(OVS_STT_PKT_KEY *pktKey) >+{ >+ UINT32 arr[3]; >+ arr[0] = pktKey->ackSeq; >+ arr[1] = pktKey->dAddr; >+ arr[2] = pktKey->sAddr; >+ return OvsJhashWords(arr, 3, OVS_HASH_BASIS); >+} >+ >+static VOID * >+OvsLookupPktFrag(OVS_STT_PKT_KEY *pktKey, UINT32 hash) >+{ >+ PLIST_ENTRY link; >+ POVS_STT_PKT_ENTRY entry; >+ >+ LIST_FORALL(&OvsSttPktFragHash[hash & STT_HASH_TABLE_MASK], link) { >+ entry = CONTAINING_RECORD(link, OVS_STT_PKT_ENTRY, link); >+ if (entry->ovsPktKey.ackSeq == pktKey->ackSeq && >+ entry->ovsPktKey.dAddr == pktKey->dAddr && >+ entry->ovsPktKey.sAddr == pktKey->sAddr) { >+ return entry; >+ } >+ } >+ return NULL; >+} >+ >+/* >+* >+------------------------------------------------------------------------- >- >+* OvsSttReassemble -- >+* Reassemble an LSO packet from multiple STT-Fragments. >+* >+------------------------------------------------------------------------- >- >+*/ >+PNET_BUFFER_LIST >+OvsSttReassemble(POVS_SWITCH_CONTEXT switchContext, >+ PNET_BUFFER_LIST curNbl, >+ IPHdr *ipHdr, >+ TCPHdr *tcp, >+ SttHdr *newSttHdr, >+ UINT16 payloadLen) >+{ >+ UINT32 seq = ntohl(tcp->seq); >+ UINT32 innerPacketLen = (seq >> STT_SEQ_LEN_SHIFT) - STT_HDR_LEN; >+ UINT32 segOffset = STT_SEGMENT_OFF(seq); >+ UINT32 offset = segOffset == 0 ? 
0 : segOffset - STT_HDR_LEN; >+ UINT32 startOffset = 0; >+ OVS_STT_PKT_ENTRY *pktFragEntry; >+ PNET_BUFFER_LIST targetPNbl = NULL; >+ BOOLEAN lastPacket = FALSE; >+ PNET_BUFFER sourceNb; >+ UINT32 fragmentLength = payloadLen; >+ SttHdr stt; >+ SttHdr *sttHdr = NULL; >+ sourceNb = NET_BUFFER_LIST_FIRST_NB(curNbl); >+ >+ /* XXX optimize this lock */ >+ NdisAcquireSpinLock(&OvsSttSpinLock); >+ >+ /* If this is the first fragment, copy the STT header */ >+ if (segOffset == 0) { >+ sttHdr = NdisGetDataBuffer(sourceNb, sizeof(SttHdr), &stt, 1, 0); >+ if (sttHdr == NULL) { >+ OVS_LOG_ERROR("Unable to retrieve STT header"); >+ return NULL; >+ } >+ fragmentLength = fragmentLength - STT_HDR_LEN; >+ startOffset = startOffset + STT_HDR_LEN; >+ } >+ >+ /* Lookup fragment */ >+ OVS_STT_PKT_KEY pktKey = OvsGeneratePacketKey(ipHdr, tcp); >+ UINT32 hash = OvsSttGetPktHash(&pktKey); >+ pktFragEntry = OvsLookupPktFrag(&pktKey, hash); >+ >+ if (pktFragEntry == NULL) { >+ /* Create a new Packet Entry */ >+ POVS_STT_PKT_ENTRY entry; >+ entry = OvsAllocateMemoryWithTag(sizeof(OVS_STT_PKT_ENTRY), >+ OVS_STT_POOL_TAG); >+ RtlZeroMemory(entry, sizeof (OVS_STT_PKT_ENTRY)); >+ >+ /* Update Key, timestamp and recvdLen */ >+ NdisMoveMemory(&entry->ovsPktKey, &pktKey, sizeof >(OVS_STT_PKT_KEY)); >+ >+ entry->recvdLen = fragmentLength; >+ >+ UINT64 currentTime; >+ NdisGetCurrentSystemTime((LARGE_INTEGER *) &currentTime); >+ entry->timeout = currentTime + STT_ENTRY_TIMEOUT; >+ >+ if (segOffset == 0) { >+ entry->sttHdr = *sttHdr; >+ } >+ >+ /* Copy the data from Source to new buffer */ >+ entry->packetBuf = OvsAllocateMemoryWithTag(innerPacketLen, >+ OVS_STT_POOL_TAG); >+ if (OvsGetPacketBytes(curNbl, fragmentLength, startOffset, >+ entry->packetBuf + offset) == NULL) { >+ OVS_LOG_ERROR("Error when obtaining bytes from Packet"); >+ goto handle_error; >+ } >+ >+ /* Insert the entry in the Static Buffer */ >+ InsertHeadList(&OvsSttPktFragHash[hash & STT_HASH_TABLE_MASK], >+ &entry->link); >+ } else { >+ /* 
Add to recieved length to identify if this is the last >fragment */ >+ pktFragEntry->recvdLen += fragmentLength; >+ lastPacket = (pktFragEntry->recvdLen == innerPacketLen); >+ >+ if (segOffset == 0) { >+ pktFragEntry->sttHdr = *sttHdr; >+ } >+ >+ /* Copy the fragment data from Source to existing buffer */ >+ if (OvsGetPacketBytes(curNbl, fragmentLength, startOffset, >+ pktFragEntry->packetBuf + offset) == NULL) >{ >+ OVS_LOG_ERROR("Error when obtaining bytes from Packet"); >+ goto handle_error; >+ } >+ } >+ >+handle_error: >+ if (lastPacket) { >+ /* Retrieve the original STT header */ >+ NdisMoveMemory(newSttHdr, &pktFragEntry->sttHdr, sizeof >(SttHdr)); >+ targetPNbl = OvsAllocateNBLFromBuffer(switchContext, >pktFragEntry->packetBuf, >+ innerPacketLen); >+ >+ /* Delete this entry and free up the memory/ */ >+ RemoveEntryList(&pktFragEntry->link); >+ OvsFreeMemoryWithTag(pktFragEntry->packetBuf, OVS_STT_POOL_TAG); >+ OvsFreeMemoryWithTag(pktFragEntry, OVS_STT_POOL_TAG); >+ } >+ >+ NdisReleaseSpinLock(&OvsSttSpinLock); >+ return lastPacket ? 
targetPNbl : NULL; >+} >+ > /* > * >-------------------------------------------------------------------------- > * OvsDecapStt -- >@@ -513,34 +694,20 @@ OvsDecapStt(POVS_SWITCH_CONTEXT switchContext, > PNET_BUFFER_LIST *newNbl) > { > NDIS_STATUS status = NDIS_STATUS_FAILURE; >- PNET_BUFFER curNb; >+ PNET_BUFFER curNb, newNb; > IPHdr *ipHdr; > char *ipBuf[sizeof(IPHdr)]; >+ SttHdr stt; > SttHdr *sttHdr; > char *sttBuf[STT_HDR_LEN]; > UINT32 advanceCnt, hdrLen; >- NDIS_TCP_IP_CHECKSUM_NET_BUFFER_LIST_INFO csumInfo; >+ BOOLEAN isLsoPacket = FALSE; > > curNb = NET_BUFFER_LIST_FIRST_NB(curNbl); > ASSERT(NET_BUFFER_NEXT_NB(curNb) == NULL); > >- if (NET_BUFFER_DATA_LENGTH(curNb) < OvsGetSttTunHdrSize()) { >- OVS_LOG_ERROR("Packet length received is less than the tunnel >header:" >- " %d<%d\n", NET_BUFFER_DATA_LENGTH(curNb), >OvsGetSttTunHdrSize()); >- return NDIS_STATUS_INVALID_LENGTH; >- } >- >- /* Verify outer TCP Checksum */ >- csumInfo.Value = NET_BUFFER_LIST_INFO(curNbl, >- >TcpIpChecksumNetBufferListInfo); >- >- /* Check if NIC has indicated TCP checksum failure */ >- if (csumInfo.Receive.TcpChecksumFailed) { >- return NDIS_STATUS_INVALID_PACKET; >- } >- >- /* Calculate the TCP Checksum */ >- status = OvsCalculateTCPChecksum(curNbl, curNb); >+ /* Validate the TCP Checksum */ >+ status = OvsValidateTCPChecksum(curNbl, curNb); > if (status != NDIS_STATUS_SUCCESS) { > return status; > } >@@ -554,34 +721,73 @@ OvsDecapStt(POVS_SWITCH_CONTEXT switchContext, > 1 /*no align*/, 0); > ASSERT(ipHdr); > >+ TCPHdr *tcp = (TCPHdr *)((PCHAR)ipHdr + ipHdr->ihl * 4); >+ > /* Skip IP & TCP headers */ > hdrLen = sizeof(IPHdr) + sizeof(TCPHdr), > NdisAdvanceNetBufferDataStart(curNb, hdrLen, FALSE, NULL); > advanceCnt += hdrLen; > >- /* STT Header */ >- sttHdr = NdisGetDataBuffer(curNb, sizeof *sttHdr, (PVOID) &sttBuf, >- 1 /*no align*/, 0); >+ UINT32 seq = ntohl(tcp->seq); >+ UINT32 totalLen = (seq >> STT_SEQ_LEN_SHIFT); >+ UINT16 payloadLen = (UINT16)ntohs(ipHdr->tot_len) >+ - 
(ipHdr->ihl * 4) >+ - (sizeof * tcp); >+ >+ /* Check if incoming packet requires reassembly */ >+ if (totalLen != payloadLen) { >+ sttHdr = &stt; >+ PNET_BUFFER_LIST pNbl = OvsSttReassemble(switchContext, curNbl, >+ ipHdr, tcp, sttHdr, >+ payloadLen); >+ if (pNbl == NULL) { >+ return NDIS_STATUS_SUCCESS; >+ } >+ >+ *newNbl = pNbl; >+ isLsoPacket = TRUE; >+ } else { >+ /* STT Header */ >+ sttHdr = NdisGetDataBuffer(curNb, sizeof *sttHdr, >+ (PVOID) &sttBuf, 1 /*no align*/, 0); >+ /* Skip stt header, DataOffset points to inner pkt now. */ >+ hdrLen = STT_HDR_LEN; >+ NdisAdvanceNetBufferDataStart(curNb, hdrLen, FALSE, NULL); >+ advanceCnt += hdrLen; >+ >+ *newNbl = OvsPartialCopyNBL(switchContext, curNbl, 0, >+ 0, FALSE /*copy NBL info*/); >+ } >+ >+ if (*newNbl == NULL) { >+ OVS_LOG_ERROR("Unable to allocate a new cloned NBL"); >+ return NDIS_STATUS_RESOURCES; >+ } >+ >+ status = NdisRetreatNetBufferDataStart(curNb, advanceCnt, 0, NULL); >+ if (status != NDIS_STATUS_SUCCESS) { >+ OvsCompleteNBL(switchContext, *newNbl, TRUE); >+ return NDIS_STATUS_FAILURE; >+ } >+ newNb = NET_BUFFER_LIST_FIRST_NB(*newNbl); >+ > ASSERT(sttHdr); > > /* Initialize the tunnel key */ > tunKey->dst = ipHdr->daddr; > tunKey->src = ipHdr->saddr; > tunKey->tunnelId = sttHdr->key; >- tunKey->flags = (OVS_TNL_F_CSUM | OVS_TNL_F_KEY); >+ tunKey->flags = OVS_TNL_F_KEY; > tunKey->tos = ipHdr->tos; > tunKey->ttl = ipHdr->ttl; > tunKey->pad = 0; > >- /* Skip stt header, DataOffset points to inner pkt now. 
*/ >- hdrLen = STT_HDR_LEN; >- NdisAdvanceNetBufferDataStart(curNb, hdrLen, FALSE, NULL); >- advanceCnt += hdrLen; >+ BOOLEAN requiresLSO = sttHdr->mss != 0; > > /* Verify checksum for inner packet if it's required */ > if (!(sttHdr->flags & STT_CSUM_VERIFIED)) { > BOOLEAN innerChecksumPartial = sttHdr->flags & STT_CSUM_PARTIAL; >- EthHdr *eth = (EthHdr *)NdisGetDataBuffer(curNb, sizeof(EthHdr), >+ EthHdr *eth = (EthHdr *)NdisGetDataBuffer(newNb, sizeof(EthHdr), > NULL, 1, 0); > > /* XXX Figure out a way to offload checksum receives */ >@@ -597,14 +803,16 @@ OvsDecapStt(POVS_SWITCH_CONTEXT switchContext, > IPPROTO_TCP, > (UINT16)l4Payload); > } >- tcp->check = CalculateChecksumNB(curNb, l4Payload, >offset); >+ if (!requiresLSO) { >+ tcp->check = CalculateChecksumNB(newNb, l4Payload, >offset); >+ } > } else if (ip->protocol == IPPROTO_UDP) { > UDPHdr *udp = (UDPHdr *)((PCHAR)ip + sizeof *ip); > if (!innerChecksumPartial){ > udp->check = IPPseudoChecksum(&ip->saddr, &ip->daddr, > IPPROTO_UDP, >l4Payload); > } >- udp->check = CalculateChecksumNB(curNb, l4Payload, >offset); >+ udp->check = CalculateChecksumNB(newNb, l4Payload, >offset); > } > } else if (eth->Type == ntohs(NDIS_ETH_TYPE_IPV6)) { > IPv6Hdr *ip = (IPv6Hdr *)((PCHAR)eth + sizeof *eth); >@@ -617,7 +825,9 @@ OvsDecapStt(POVS_SWITCH_CONTEXT switchContext, > (UINT32 *)&ip->daddr, > IPPROTO_TCP, >totalLength); > } >- tcp->check = CalculateChecksumNB(curNb, totalLength, >offset); >+ if (!requiresLSO) { >+ tcp->check = CalculateChecksumNB(newNb, totalLength, >offset); >+ } > } > else if (ip->nexthdr == IPPROTO_UDP) { > UDPHdr *udp = (UDPHdr *)((PCHAR)ip + sizeof *ip); >@@ -626,23 +836,27 @@ OvsDecapStt(POVS_SWITCH_CONTEXT switchContext, > (UINT32 *)&ip->daddr, > IPPROTO_UDP, >totalLength); > } >- udp->check = CalculateChecksumNB(curNb, totalLength, >offset); >+ udp->check = CalculateChecksumNB(newNb, totalLength, >offset); > } > } > >- NET_BUFFER_LIST_INFO(curNbl, TcpIpChecksumNetBufferListInfo) = 0; >+ 
NET_BUFFER_LIST_INFO(*newNbl, TcpIpChecksumNetBufferListInfo) = >0; > } > >- *newNbl = OvsPartialCopyNBL(switchContext, curNbl, >OVS_DEFAULT_COPY_SIZE, >- 0, FALSE /*copy NBL info*/); >- >- ASSERT(advanceCnt == OvsGetSttTunHdrSize()); >- status = NdisRetreatNetBufferDataStart(curNb, advanceCnt, 0, NULL); >- >- if (*newNbl == NULL) { >- OVS_LOG_ERROR("OvsDecapStt: Unable to allocate a new cloned >NBL"); >- status = NDIS_STATUS_RESOURCES; >+ if (requiresLSO) { >+ NDIS_TCP_LARGE_SEND_OFFLOAD_NET_BUFFER_LIST_INFO lsoInfo; >+ lsoInfo.Value = 0; >+ lsoInfo.LsoV2Transmit.TcpHeaderOffset = sttHdr->l4Offset; >+ lsoInfo.LsoV2Transmit.MSS = ETH_DEFAULT_MTU - sizeof(IPHdr) - >sizeof(TCPHdr); >+ lsoInfo.LsoV2Transmit.Type = NDIS_TCP_LARGE_SEND_OFFLOAD_V2_TYPE; >+ if (sttHdr->flags & STT_PROTO_IPV4) { >+ lsoInfo.LsoV2Transmit.IPVersion = >NDIS_TCP_LARGE_SEND_OFFLOAD_IPv4; >+ } else { >+ lsoInfo.LsoV2Transmit.IPVersion = >NDIS_TCP_LARGE_SEND_OFFLOAD_IPv6; >+ } >+ NET_BUFFER_LIST_INFO(*newNbl, >+ TcpLargeSendNetBufferListInfo) = >lsoInfo.Value; > } > >- return status; >+ return NDIS_STATUS_SUCCESS; > } >-- >1.9.5.msysgit.0 >
> Acked-by: Nithin Raju <nithin@vmware.com> Series applied. > > -----Original Message----- > From: Sairam Venugopal <vsairam@vmware.com> > Date: Tuesday, October 27, 2015 at 10:20 AM > To: Nithin Raju <nithin@vmware.com> > Subject: Fw: [PATCH v2 3/3] datapath-windows: STT - Enable support for TCP > Segmentation offloads > >> >>________________________________________ >>From: Sairam Venugopal <vsairam@vmware.com> >>Sent: Monday, October 26, 2015 4:48 PM >>To: dev@openvswitch.org >>Cc: Sairam Venugopal >>Subject: [PATCH v2 3/3] datapath-windows: STT - Enable support for TCP >>Segmentation offloads >> >>Add support to STT - Encap and Decap functions to reassemble the packet >>fragments. Also add support to offload the packet to NDIS. >> >>Signed-off-by: Sairam Venugopal <vsairam@vmware.com> >>--- >> datapath-windows/ovsext/Actions.c | 40 ++-- >> datapath-windows/ovsext/Stt.c | 398 >>+++++++++++++++++++++++++++++--------- >> 2 files changed, 329 insertions(+), 109 deletions(-) >> >>diff --git a/datapath-windows/ovsext/Actions.c >>b/datapath-windows/ovsext/Actions.c >>index b4644a7..ce592b3 100644 >>--- a/datapath-windows/ovsext/Actions.c >>+++ b/datapath-windows/ovsext/Actions.c >>@@ -594,7 +594,7 @@ OvsDoFlowLookupOutput(OvsForwardingContext *ovsFwdCtx) >> InitializeListHead(&missedPackets); >> status = OvsCreateAndAddPackets(NULL, 0, OVS_PACKET_CMD_MISS, >>vport, >> &key,ovsFwdCtx->curNbl, >>- ovsFwdCtx->tunnelRxNic != NULL, >>&ovsFwdCtx->layers, >>+ FALSE, &ovsFwdCtx->layers, >> ovsFwdCtx->switchContext, &missedPackets, >>&num); >> if (num) { >> OvsQueuePackets(&missedPackets, num); >>@@ -709,6 +709,7 @@ OvsTunnelPortRx(OvsForwardingContext *ovsFwdCtx) >> NDIS_STATUS status = NDIS_STATUS_SUCCESS; >> PNET_BUFFER_LIST newNbl = NULL; >> POVS_VPORT_ENTRY tunnelRxVport = ovsFwdCtx->tunnelRxNic; >>+ PCWSTR dropReason = L"OVS-dropped due to new decap packet"; >> >> if (OvsValidateIPChecksum(ovsFwdCtx->curNbl, &ovsFwdCtx->layers) >> != NDIS_STATUS_SUCCESS) { >>@@ -730,6 
+731,10 @@ OvsTunnelPortRx(OvsForwardingContext *ovsFwdCtx) >> case OVS_VPORT_TYPE_STT: >> status = OvsDecapStt(ovsFwdCtx->switchContext, ovsFwdCtx->curNbl, >> &ovsFwdCtx->tunKey, &newNbl); >>+ if (status == NDIS_STATUS_SUCCESS && newNbl == NULL) { >>+ /* This was an STT-LSO Fragment */ >>+ dropReason = L"OVS-STT segment is cached"; >>+ } >> break; >> default: >> OVS_LOG_ERROR("Rx: Unhandled tunnel type: %d\n", >>@@ -747,25 +752,26 @@ OvsTunnelPortRx(OvsForwardingContext *ovsFwdCtx) >> * tunnelRxNic and other fields will be cleared, re-init the context >> * before usage. >> */ >>- OvsCompleteNBLForwardingCtx(ovsFwdCtx, >>- L"OVS-dropped due to new decap packet"); >>+ OvsCompleteNBLForwardingCtx(ovsFwdCtx, dropReason); >> >>- /* Decapsulated packet is in a new NBL */ >>- ovsFwdCtx->tunnelRxNic = tunnelRxVport; >>- OvsInitForwardingCtx(ovsFwdCtx, ovsFwdCtx->switchContext, >>- newNbl, tunnelRxVport->portNo, 0, >>- >>NET_BUFFER_LIST_SWITCH_FORWARDING_DETAIL(newNbl), >>- ovsFwdCtx->completionList, >>- &ovsFwdCtx->layers, FALSE); >>+ if (newNbl) { >>+ /* Decapsulated packet is in a new NBL */ >>+ ovsFwdCtx->tunnelRxNic = tunnelRxVport; >>+ OvsInitForwardingCtx(ovsFwdCtx, ovsFwdCtx->switchContext, >>+ newNbl, tunnelRxVport->portNo, 0, >>+ >>NET_BUFFER_LIST_SWITCH_FORWARDING_DETAIL(newNbl), >>+ ovsFwdCtx->completionList, >>+ &ovsFwdCtx->layers, FALSE); >> >>- /* >>- * Set the NBL's SourcePortId and SourceNicIndex to default values to >>- * keep NDIS happy when we forward the packet. >>- */ >>- ovsFwdCtx->fwdDetail->SourcePortId = NDIS_SWITCH_DEFAULT_PORT_ID; >>- ovsFwdCtx->fwdDetail->SourceNicIndex = 0; >>+ /* >>+ * Set the NBL's SourcePortId and SourceNicIndex to default >>values to >>+ * keep NDIS happy when we forward the packet. 
>>+ */ >>+ ovsFwdCtx->fwdDetail->SourcePortId = NDIS_SWITCH_DEFAULT_PORT_ID; >>+ ovsFwdCtx->fwdDetail->SourceNicIndex = 0; >> >>- status = OvsDoFlowLookupOutput(ovsFwdCtx); >>+ status = OvsDoFlowLookupOutput(ovsFwdCtx); >>+ } >> ASSERT(ovsFwdCtx->curNbl == NULL); >> OvsClearTunRxCtx(ovsFwdCtx); >> >>diff --git a/datapath-windows/ovsext/Stt.c b/datapath-windows/ovsext/Stt.c >>index b78ef95..ef44d23 100644 >>--- a/datapath-windows/ovsext/Stt.c >>+++ b/datapath-windows/ovsext/Stt.c >>@@ -34,6 +34,7 @@ >> #endif >> #define OVS_DBG_MOD OVS_DBG_STT >> #include "Debug.h" >>+#include "Jhash.h" >> >> KSTART_ROUTINE OvsSttDefragCleaner; >> static PLIST_ENTRY OvsSttPktFragHash; >>@@ -152,8 +153,8 @@ OvsDoEncapStt(POVS_VPORT_ENTRY vport, >> UINT32 headRoom = OvsGetSttTunHdrSize(); >> UINT32 tcpChksumLen; >> PUINT8 bufferStart; >>- >>- UNREFERENCED_PARAMETER(layers); >>+ ULONG mss = 0; >>+ NDIS_TCP_LARGE_SEND_OFFLOAD_NET_BUFFER_LIST_INFO lsoInfo; >> >> curNb = NET_BUFFER_LIST_FIRST_NB(curNbl); >> >>@@ -162,14 +163,20 @@ OvsDoEncapStt(POVS_VPORT_ENTRY vport, >> BOOLEAN innerPartialChecksum = FALSE; >> >> if (layers->isTcp) { >>- NDIS_TCP_LARGE_SEND_OFFLOAD_NET_BUFFER_LIST_INFO lsoInfo; >>- >> lsoInfo.Value = NET_BUFFER_LIST_INFO(curNbl, >> TcpLargeSendNetBufferListInfo); >>- if (lsoInfo.LsoV1Transmit.MSS) { >>- /* XXX We don't handle LSO yet */ >>- OVS_LOG_ERROR("LSO on STT is not supported"); >>- return NDIS_STATUS_FAILURE; >>+ >>+ switch (lsoInfo.Transmit.Type) { >>+ case NDIS_TCP_LARGE_SEND_OFFLOAD_V1_TYPE: >>+ mss = lsoInfo.LsoV1Transmit.MSS; >>+ break; >>+ case NDIS_TCP_LARGE_SEND_OFFLOAD_V2_TYPE: >>+ mss = lsoInfo.LsoV2Transmit.MSS; >>+ break; >>+ default: >>+ OVS_LOG_ERROR("Unknown LSO transmit type:%d", >>+ lsoInfo.Transmit.Type); >>+ return NDIS_STATUS_FAILURE; >> } >> } >> >>@@ -186,21 +193,36 @@ OvsDoEncapStt(POVS_VPORT_ENTRY vport, >> return NDIS_STATUS_FAILURE; >> } >> >>- curNb = NET_BUFFER_LIST_FIRST_NB(*newNbl); >>+ curNbl = *newNbl; >>+ curNb = 
NET_BUFFER_LIST_FIRST_NB(curNbl); >> curMdl = NET_BUFFER_CURRENT_MDL(curNb); >>+ /* NB Chain should be split before */ >>+ ASSERT(NET_BUFFER_NEXT_NB(curNb) == NULL); >>+ innerFrameLen = NET_BUFFER_DATA_LENGTH(curNb); >>+ >> bufferStart = (PUINT8)MmGetSystemAddressForMdlSafe(curMdl, >> LowPagePriority); >> bufferStart += NET_BUFFER_CURRENT_MDL_OFFSET(curNb); >> >>- if (layers->isIPv4 && csumInfo.Transmit.IpHeaderChecksum) { >>+ if (layers->isIPv4) { >> IPHdr *ip = (IPHdr *)(bufferStart + layers->l3Offset); >>- ip->check = IPChecksum((UINT8 *)ip, ip->ihl * 4, 0); >>+ if (!ip->tot_len) { >>+ ip->tot_len = htons(innerFrameLen - sizeof(EthHdr)); >>+ } >>+ if (!ip->check) { >>+ ip->check = IPChecksum((UINT8 *)ip, ip->ihl * 4, 0); >>+ } >> } >>+ >> if (layers->isTcp) { >>- if(!csumInfo.Transmit.TcpChecksum) { >>- innerChecksumVerified = TRUE; >>- } else { >>+ if (mss) { >> innerPartialChecksum = TRUE; >>+ } else { >>+ if (!csumInfo.Transmit.TcpChecksum) { >>+ innerChecksumVerified = TRUE; >>+ } else { >>+ innerPartialChecksum = TRUE; >>+ } >> } >> } else if (layers->isUdp) { >> if(!csumInfo.Transmit.UdpChecksum) { >>@@ -210,24 +232,6 @@ OvsDoEncapStt(POVS_VPORT_ENTRY vport, >> } >> } >> >>- curNbl = *newNbl; >>- curNb = NET_BUFFER_LIST_FIRST_NB(curNbl); >>- /* NB Chain should be split before */ >>- ASSERT(NET_BUFFER_NEXT_NB(curNb) == NULL); >>- >>- innerFrameLen = NET_BUFFER_DATA_LENGTH(curNb); >>- /* >>- * External port can't be removed as we hold the dispatch lock >>- * We also check if the external port was removed beforecalling >>- * port encapsulation functions >>- */ >>- if (innerFrameLen > OvsGetExternalMtu(switchContext) - headRoom) { >>- OVS_LOG_ERROR("Packet too large (size %d, mtu %d). 
Can't >>encapsulate", >>- innerFrameLen, OvsGetExternalMtu(switchContext)); >>- status = NDIS_STATUS_FAILURE; >>- goto ret_error; >>- } >>- >> status = NdisRetreatNetBufferDataStart(curNb, headRoom, 0, NULL); >> if (status != NDIS_STATUS_SUCCESS) { >> ASSERT(!"Unable to NdisRetreatNetBufferDataStart(headroom)"); >>@@ -301,33 +305,52 @@ OvsDoEncapStt(POVS_VPORT_ENTRY vport, >> IPPROTO_TCP, (uint16) >>tcpChksumLen); >> sttHdr->version = 0; >> >>- /* XXX need to peek into the inner packet, hard code for now */ >>- sttHdr->flags = STT_PROTO_IPV4; >>- if (innerChecksumVerified) { >>- sttHdr->flags |= STT_CSUM_VERIFIED; >>- } else if (innerPartialChecksum) { >>+ /* Set STT Header */ >>+ sttHdr->flags = 0; >>+ if (innerPartialChecksum) { >> sttHdr->flags |= STT_CSUM_PARTIAL; >>+ if (layers->isIPv4) { >>+ sttHdr->flags |= STT_PROTO_IPV4; >>+ } >>+ if (layers->isTcp) { >>+ sttHdr->flags |= STT_PROTO_TCP; >>+ } >>+ sttHdr->l4Offset = (UINT8) layers->l4Offset; >>+ sttHdr->mss = (UINT16) htons(mss); >>+ } else if (innerChecksumVerified) { >>+ sttHdr->flags = STT_CSUM_VERIFIED; >>+ sttHdr->l4Offset = 0; >>+ sttHdr->mss = 0; >> } >>- sttHdr->l4Offset = 0; >> >> sttHdr->reserved = 0; >>- /* XXX Used for large TCP packets.Not sure how it is used, clarify */ >>- sttHdr->mss = 0; >> sttHdr->vlanTCI = 0; >> sttHdr->key = tunKey->tunnelId; >> /* Zero out stt padding */ >> *(uint16 *)(sttHdr + 1) = 0; >> >> /* Offload IP and TCP checksum */ >>+ ULONG tcpHeaderOffset = sizeof *outerEthHdr + >>+ outerIpHdr->ihl * 4; >> csumInfo.Value = 0; >> csumInfo.Transmit.IpHeaderChecksum = 1; >> csumInfo.Transmit.TcpChecksum = 1; >> csumInfo.Transmit.IsIPv4 = 1; >>- csumInfo.Transmit.TcpHeaderOffset = sizeof *outerEthHdr + >>- outerIpHdr->ihl * 4; >>+ csumInfo.Transmit.TcpHeaderOffset = tcpHeaderOffset; >> NET_BUFFER_LIST_INFO(curNbl, >> TcpIpChecksumNetBufferListInfo) = >>csumInfo.Value; >> >>+ UINT32 encapMss = OvsGetExternalMtu(switchContext) - sizeof(IPHdr) - >>sizeof(TCPHdr); >>+ if (ipTotalLen 
> encapMss) { >>+ lsoInfo.Value = 0; >>+ lsoInfo.LsoV2Transmit.TcpHeaderOffset = tcpHeaderOffset; >>+ lsoInfo.LsoV2Transmit.MSS = encapMss; >>+ lsoInfo.LsoV2Transmit.Type = NDIS_TCP_LARGE_SEND_OFFLOAD_V2_TYPE; >>+ lsoInfo.LsoV2Transmit.IPVersion = >>NDIS_TCP_LARGE_SEND_OFFLOAD_IPv4; >>+ NET_BUFFER_LIST_INFO(curNbl, >>+ TcpLargeSendNetBufferListInfo) = >>lsoInfo.Value; >>+ } >>+ >> return STATUS_SUCCESS; >> >> ret_error: >>@@ -338,16 +361,22 @@ ret_error: >> >> /* >> >>*------------------------------------------------------------------------- >>--- >>- * OvsCalculateTCPChecksum >>- * Calculate TCP checksum >>+ * OvsValidateTCPChecksum >>+ * Validate TCP checksum >> >>*------------------------------------------------------------------------- >>--- >> */ >> static __inline NDIS_STATUS >>-OvsCalculateTCPChecksum(PNET_BUFFER_LIST curNbl, PNET_BUFFER curNb) >>+OvsValidateTCPChecksum(PNET_BUFFER_LIST curNbl, PNET_BUFFER curNb) >> { >> NDIS_TCP_IP_CHECKSUM_NET_BUFFER_LIST_INFO csumInfo; >> csumInfo.Value = NET_BUFFER_LIST_INFO(curNbl, >> >>TcpIpChecksumNetBufferListInfo); >>+ >>+ /* Check if NIC has indicated TCP checksum failure */ >>+ if (csumInfo.Receive.TcpChecksumFailed) { >>+ return NDIS_STATUS_INVALID_PACKET; >>+ } >>+ >> UINT16 checkSum; >> >> /* Check if TCP Checksum has been calculated by NIC */ >>@@ -399,10 +428,9 @@ OvsInitSttDefragmentation() >> NdisAllocateSpinLock(&OvsSttSpinLock); >> >> /* Init the Hash Buffer */ >>- OvsSttPktFragHash = (PLIST_ENTRY) OvsAllocateMemoryWithTag( >>- sizeof(LIST_ENTRY) >>- * STT_HASH_TABLE_SIZE, >>- OVS_STT_POOL_TAG); >>+ OvsSttPktFragHash = OvsAllocateMemoryWithTag(sizeof(LIST_ENTRY) >>+ * STT_HASH_TABLE_SIZE, >>+ OVS_STT_POOL_TAG); >> if (OvsSttPktFragHash == NULL) { >> NdisFreeSpinLock(&OvsSttSpinLock); >> return STATUS_INSUFFICIENT_RESOURCES; >>@@ -487,6 +515,7 @@ OvsSttDefragCleaner(PVOID data) >> entry = CONTAINING_RECORD(link, OVS_STT_PKT_ENTRY, link); >> if (entry->timeout < currentTime) { >> 
RemoveEntryList(&entry->link); >>+ OvsFreeMemoryWithTag(entry->packetBuf, >>OVS_STT_POOL_TAG); >> OvsFreeMemoryWithTag(entry, OVS_STT_POOL_TAG); >> } >> } >>@@ -500,6 +529,158 @@ OvsSttDefragCleaner(PVOID data) >> PsTerminateSystemThread(STATUS_SUCCESS); >> } >> >>+static OVS_STT_PKT_KEY >>+OvsGeneratePacketKey(IPHdr *ipHdr, TCPHdr *tcpHdr) >>+{ >>+ OVS_STT_PKT_KEY key; >>+ key.sAddr = ipHdr->saddr; >>+ key.dAddr = ipHdr->daddr; >>+ key.ackSeq = ntohl(tcpHdr->ack_seq); >>+ return key; >>+} >>+ >>+static UINT32 >>+OvsSttGetPktHash(OVS_STT_PKT_KEY *pktKey) >>+{ >>+ UINT32 arr[3]; >>+ arr[0] = pktKey->ackSeq; >>+ arr[1] = pktKey->dAddr; >>+ arr[2] = pktKey->sAddr; >>+ return OvsJhashWords(arr, 3, OVS_HASH_BASIS); >>+} >>+ >>+static VOID * >>+OvsLookupPktFrag(OVS_STT_PKT_KEY *pktKey, UINT32 hash) >>+{ >>+ PLIST_ENTRY link; >>+ POVS_STT_PKT_ENTRY entry; >>+ >>+ LIST_FORALL(&OvsSttPktFragHash[hash & STT_HASH_TABLE_MASK], link) { >>+ entry = CONTAINING_RECORD(link, OVS_STT_PKT_ENTRY, link); >>+ if (entry->ovsPktKey.ackSeq == pktKey->ackSeq && >>+ entry->ovsPktKey.dAddr == pktKey->dAddr && >>+ entry->ovsPktKey.sAddr == pktKey->sAddr) { >>+ return entry; >>+ } >>+ } >>+ return NULL; >>+} >>+ >>+/* >>+* >>+------------------------------------------------------------------------- >>- >>+* OvsSttReassemble -- >>+* Reassemble an LSO packet from multiple STT-Fragments. >>+* >>+------------------------------------------------------------------------- >>- >>+*/ >>+PNET_BUFFER_LIST >>+OvsSttReassemble(POVS_SWITCH_CONTEXT switchContext, >>+ PNET_BUFFER_LIST curNbl, >>+ IPHdr *ipHdr, >>+ TCPHdr *tcp, >>+ SttHdr *newSttHdr, >>+ UINT16 payloadLen) >>+{ >>+ UINT32 seq = ntohl(tcp->seq); >>+ UINT32 innerPacketLen = (seq >> STT_SEQ_LEN_SHIFT) - STT_HDR_LEN; >>+ UINT32 segOffset = STT_SEGMENT_OFF(seq); >>+ UINT32 offset = segOffset == 0 ? 
0 : segOffset - STT_HDR_LEN; >>+ UINT32 startOffset = 0; >>+ OVS_STT_PKT_ENTRY *pktFragEntry; >>+ PNET_BUFFER_LIST targetPNbl = NULL; >>+ BOOLEAN lastPacket = FALSE; >>+ PNET_BUFFER sourceNb; >>+ UINT32 fragmentLength = payloadLen; >>+ SttHdr stt; >>+ SttHdr *sttHdr = NULL; >>+ sourceNb = NET_BUFFER_LIST_FIRST_NB(curNbl); >>+ >>+ /* XXX optimize this lock */ >>+ NdisAcquireSpinLock(&OvsSttSpinLock); >>+ >>+ /* If this is the first fragment, copy the STT header */ >>+ if (segOffset == 0) { >>+ sttHdr = NdisGetDataBuffer(sourceNb, sizeof(SttHdr), &stt, 1, 0); >>+ if (sttHdr == NULL) { >>+ OVS_LOG_ERROR("Unable to retrieve STT header"); >>+ return NULL; >>+ } >>+ fragmentLength = fragmentLength - STT_HDR_LEN; >>+ startOffset = startOffset + STT_HDR_LEN; >>+ } >>+ >>+ /* Lookup fragment */ >>+ OVS_STT_PKT_KEY pktKey = OvsGeneratePacketKey(ipHdr, tcp); >>+ UINT32 hash = OvsSttGetPktHash(&pktKey); >>+ pktFragEntry = OvsLookupPktFrag(&pktKey, hash); >>+ >>+ if (pktFragEntry == NULL) { >>+ /* Create a new Packet Entry */ >>+ POVS_STT_PKT_ENTRY entry; >>+ entry = OvsAllocateMemoryWithTag(sizeof(OVS_STT_PKT_ENTRY), >>+ OVS_STT_POOL_TAG); >>+ RtlZeroMemory(entry, sizeof (OVS_STT_PKT_ENTRY)); >>+ >>+ /* Update Key, timestamp and recvdLen */ >>+ NdisMoveMemory(&entry->ovsPktKey, &pktKey, sizeof >>(OVS_STT_PKT_KEY)); >>+ >>+ entry->recvdLen = fragmentLength; >>+ >>+ UINT64 currentTime; >>+ NdisGetCurrentSystemTime((LARGE_INTEGER *) ¤tTime); >>+ entry->timeout = currentTime + STT_ENTRY_TIMEOUT; >>+ >>+ if (segOffset == 0) { >>+ entry->sttHdr = *sttHdr; >>+ } >>+ >>+ /* Copy the data from Source to new buffer */ >>+ entry->packetBuf = OvsAllocateMemoryWithTag(innerPacketLen, >>+ OVS_STT_POOL_TAG); >>+ if (OvsGetPacketBytes(curNbl, fragmentLength, startOffset, >>+ entry->packetBuf + offset) == NULL) { >>+ OVS_LOG_ERROR("Error when obtaining bytes from Packet"); >>+ goto handle_error; >>+ } >>+ >>+ /* Insert the entry in the Static Buffer */ >>+ 
InsertHeadList(&OvsSttPktFragHash[hash & STT_HASH_TABLE_MASK], >>+ &entry->link); >>+ } else { >>+ /* Add to recieved length to identify if this is the last >>fragment */ >>+ pktFragEntry->recvdLen += fragmentLength; >>+ lastPacket = (pktFragEntry->recvdLen == innerPacketLen); >>+ >>+ if (segOffset == 0) { >>+ pktFragEntry->sttHdr = *sttHdr; >>+ } >>+ >>+ /* Copy the fragment data from Source to existing buffer */ >>+ if (OvsGetPacketBytes(curNbl, fragmentLength, startOffset, >>+ pktFragEntry->packetBuf + offset) == NULL) >>{ >>+ OVS_LOG_ERROR("Error when obtaining bytes from Packet"); >>+ goto handle_error; >>+ } >>+ } >>+ >>+handle_error: >>+ if (lastPacket) { >>+ /* Retrieve the original STT header */ >>+ NdisMoveMemory(newSttHdr, &pktFragEntry->sttHdr, sizeof >>(SttHdr)); >>+ targetPNbl = OvsAllocateNBLFromBuffer(switchContext, >>pktFragEntry->packetBuf, >>+ innerPacketLen); >>+ >>+ /* Delete this entry and free up the memory/ */ >>+ RemoveEntryList(&pktFragEntry->link); >>+ OvsFreeMemoryWithTag(pktFragEntry->packetBuf, OVS_STT_POOL_TAG); >>+ OvsFreeMemoryWithTag(pktFragEntry, OVS_STT_POOL_TAG); >>+ } >>+ >>+ NdisReleaseSpinLock(&OvsSttSpinLock); >>+ return lastPacket ? 
targetPNbl : NULL; >>+} >>+ >> /* >> * >>-------------------------------------------------------------------------- >> * OvsDecapStt -- >>@@ -513,34 +694,20 @@ OvsDecapStt(POVS_SWITCH_CONTEXT switchContext, >> PNET_BUFFER_LIST *newNbl) >> { >> NDIS_STATUS status = NDIS_STATUS_FAILURE; >>- PNET_BUFFER curNb; >>+ PNET_BUFFER curNb, newNb; >> IPHdr *ipHdr; >> char *ipBuf[sizeof(IPHdr)]; >>+ SttHdr stt; >> SttHdr *sttHdr; >> char *sttBuf[STT_HDR_LEN]; >> UINT32 advanceCnt, hdrLen; >>- NDIS_TCP_IP_CHECKSUM_NET_BUFFER_LIST_INFO csumInfo; >>+ BOOLEAN isLsoPacket = FALSE; >> >> curNb = NET_BUFFER_LIST_FIRST_NB(curNbl); >> ASSERT(NET_BUFFER_NEXT_NB(curNb) == NULL); >> >>- if (NET_BUFFER_DATA_LENGTH(curNb) < OvsGetSttTunHdrSize()) { >>- OVS_LOG_ERROR("Packet length received is less than the tunnel >>header:" >>- " %d<%d\n", NET_BUFFER_DATA_LENGTH(curNb), >>OvsGetSttTunHdrSize()); >>- return NDIS_STATUS_INVALID_LENGTH; >>- } >>- >>- /* Verify outer TCP Checksum */ >>- csumInfo.Value = NET_BUFFER_LIST_INFO(curNbl, >>- >>TcpIpChecksumNetBufferListInfo); >>- >>- /* Check if NIC has indicated TCP checksum failure */ >>- if (csumInfo.Receive.TcpChecksumFailed) { >>- return NDIS_STATUS_INVALID_PACKET; >>- } >>- >>- /* Calculate the TCP Checksum */ >>- status = OvsCalculateTCPChecksum(curNbl, curNb); >>+ /* Validate the TCP Checksum */ >>+ status = OvsValidateTCPChecksum(curNbl, curNb); >> if (status != NDIS_STATUS_SUCCESS) { >> return status; >> } >>@@ -554,34 +721,73 @@ OvsDecapStt(POVS_SWITCH_CONTEXT switchContext, >> 1 /*no align*/, 0); >> ASSERT(ipHdr); >> >>+ TCPHdr *tcp = (TCPHdr *)((PCHAR)ipHdr + ipHdr->ihl * 4); >>+ >> /* Skip IP & TCP headers */ >> hdrLen = sizeof(IPHdr) + sizeof(TCPHdr), >> NdisAdvanceNetBufferDataStart(curNb, hdrLen, FALSE, NULL); >> advanceCnt += hdrLen; >> >>- /* STT Header */ >>- sttHdr = NdisGetDataBuffer(curNb, sizeof *sttHdr, (PVOID) &sttBuf, >>- 1 /*no align*/, 0); >>+ UINT32 seq = ntohl(tcp->seq); >>+ UINT32 totalLen = (seq >> STT_SEQ_LEN_SHIFT); 
>>+ UINT16 payloadLen = (UINT16)ntohs(ipHdr->tot_len) >>+ - (ipHdr->ihl * 4) >>+ - (sizeof * tcp); >>+ >>+ /* Check if incoming packet requires reassembly */ >>+ if (totalLen != payloadLen) { >>+ sttHdr = &stt; >>+ PNET_BUFFER_LIST pNbl = OvsSttReassemble(switchContext, curNbl, >>+ ipHdr, tcp, sttHdr, >>+ payloadLen); >>+ if (pNbl == NULL) { >>+ return NDIS_STATUS_SUCCESS; >>+ } >>+ >>+ *newNbl = pNbl; >>+ isLsoPacket = TRUE; >>+ } else { >>+ /* STT Header */ >>+ sttHdr = NdisGetDataBuffer(curNb, sizeof *sttHdr, >>+ (PVOID) &sttBuf, 1 /*no align*/, 0); >>+ /* Skip stt header, DataOffset points to inner pkt now. */ >>+ hdrLen = STT_HDR_LEN; >>+ NdisAdvanceNetBufferDataStart(curNb, hdrLen, FALSE, NULL); >>+ advanceCnt += hdrLen; >>+ >>+ *newNbl = OvsPartialCopyNBL(switchContext, curNbl, 0, >>+ 0, FALSE /*copy NBL info*/); >>+ } >>+ >>+ if (*newNbl == NULL) { >>+ OVS_LOG_ERROR("Unable to allocate a new cloned NBL"); >>+ return NDIS_STATUS_RESOURCES; >>+ } >>+ >>+ status = NdisRetreatNetBufferDataStart(curNb, advanceCnt, 0, NULL); >>+ if (status != NDIS_STATUS_SUCCESS) { >>+ OvsCompleteNBL(switchContext, *newNbl, TRUE); >>+ return NDIS_STATUS_FAILURE; >>+ } >>+ newNb = NET_BUFFER_LIST_FIRST_NB(*newNbl); >>+ >> ASSERT(sttHdr); >> >> /* Initialize the tunnel key */ >> tunKey->dst = ipHdr->daddr; >> tunKey->src = ipHdr->saddr; >> tunKey->tunnelId = sttHdr->key; >>- tunKey->flags = (OVS_TNL_F_CSUM | OVS_TNL_F_KEY); >>+ tunKey->flags = OVS_TNL_F_KEY; >> tunKey->tos = ipHdr->tos; >> tunKey->ttl = ipHdr->ttl; >> tunKey->pad = 0; >> >>- /* Skip stt header, DataOffset points to inner pkt now. 
*/ >>- hdrLen = STT_HDR_LEN; >>- NdisAdvanceNetBufferDataStart(curNb, hdrLen, FALSE, NULL); >>- advanceCnt += hdrLen; >>+ BOOLEAN requiresLSO = sttHdr->mss != 0; >> >> /* Verify checksum for inner packet if it's required */ >> if (!(sttHdr->flags & STT_CSUM_VERIFIED)) { >> BOOLEAN innerChecksumPartial = sttHdr->flags & STT_CSUM_PARTIAL; >>- EthHdr *eth = (EthHdr *)NdisGetDataBuffer(curNb, sizeof(EthHdr), >>+ EthHdr *eth = (EthHdr *)NdisGetDataBuffer(newNb, sizeof(EthHdr), >> NULL, 1, 0); >> >> /* XXX Figure out a way to offload checksum receives */ >>@@ -597,14 +803,16 @@ OvsDecapStt(POVS_SWITCH_CONTEXT switchContext, >> IPPROTO_TCP, >> (UINT16)l4Payload); >> } >>- tcp->check = CalculateChecksumNB(curNb, l4Payload, >>offset); >>+ if (!requiresLSO) { >>+ tcp->check = CalculateChecksumNB(newNb, l4Payload, >>offset); >>+ } >> } else if (ip->protocol == IPPROTO_UDP) { >> UDPHdr *udp = (UDPHdr *)((PCHAR)ip + sizeof *ip); >> if (!innerChecksumPartial){ >> udp->check = IPPseudoChecksum(&ip->saddr, &ip->daddr, >> IPPROTO_UDP, >>l4Payload); >> } >>- udp->check = CalculateChecksumNB(curNb, l4Payload, >>offset); >>+ udp->check = CalculateChecksumNB(newNb, l4Payload, >>offset); >> } >> } else if (eth->Type == ntohs(NDIS_ETH_TYPE_IPV6)) { >> IPv6Hdr *ip = (IPv6Hdr *)((PCHAR)eth + sizeof *eth); >>@@ -617,7 +825,9 @@ OvsDecapStt(POVS_SWITCH_CONTEXT switchContext, >> (UINT32 *)&ip->daddr, >> IPPROTO_TCP, >>totalLength); >> } >>- tcp->check = CalculateChecksumNB(curNb, totalLength, >>offset); >>+ if (!requiresLSO) { >>+ tcp->check = CalculateChecksumNB(newNb, totalLength, >>offset); >>+ } >> } >> else if (ip->nexthdr == IPPROTO_UDP) { >> UDPHdr *udp = (UDPHdr *)((PCHAR)ip + sizeof *ip); >>@@ -626,23 +836,27 @@ OvsDecapStt(POVS_SWITCH_CONTEXT switchContext, >> (UINT32 *)&ip->daddr, >> IPPROTO_UDP, >>totalLength); >> } >>- udp->check = CalculateChecksumNB(curNb, totalLength, >>offset); >>+ udp->check = CalculateChecksumNB(newNb, totalLength, >>offset); >> } >> } >> >>- 
NET_BUFFER_LIST_INFO(curNbl, TcpIpChecksumNetBufferListInfo) = 0; >>+ NET_BUFFER_LIST_INFO(*newNbl, TcpIpChecksumNetBufferListInfo) = >>0; >> } >> >>- *newNbl = OvsPartialCopyNBL(switchContext, curNbl, >>OVS_DEFAULT_COPY_SIZE, >>- 0, FALSE /*copy NBL info*/); >>- >>- ASSERT(advanceCnt == OvsGetSttTunHdrSize()); >>- status = NdisRetreatNetBufferDataStart(curNb, advanceCnt, 0, NULL); >>- >>- if (*newNbl == NULL) { >>- OVS_LOG_ERROR("OvsDecapStt: Unable to allocate a new cloned >>NBL"); >>- status = NDIS_STATUS_RESOURCES; >>+ if (requiresLSO) { >>+ NDIS_TCP_LARGE_SEND_OFFLOAD_NET_BUFFER_LIST_INFO lsoInfo; >>+ lsoInfo.Value = 0; >>+ lsoInfo.LsoV2Transmit.TcpHeaderOffset = sttHdr->l4Offset; >>+ lsoInfo.LsoV2Transmit.MSS = ETH_DEFAULT_MTU - sizeof(IPHdr) - >>sizeof(TCPHdr); >>+ lsoInfo.LsoV2Transmit.Type = NDIS_TCP_LARGE_SEND_OFFLOAD_V2_TYPE; >>+ if (sttHdr->flags & STT_PROTO_IPV4) { >>+ lsoInfo.LsoV2Transmit.IPVersion = >>NDIS_TCP_LARGE_SEND_OFFLOAD_IPv4; >>+ } else { >>+ lsoInfo.LsoV2Transmit.IPVersion = >>NDIS_TCP_LARGE_SEND_OFFLOAD_IPv6; >>+ } >>+ NET_BUFFER_LIST_INFO(*newNbl, >>+ TcpLargeSendNetBufferListInfo) = >>lsoInfo.Value; >> } >> >>- return status; >>+ return NDIS_STATUS_SUCCESS; >> } >>-- >>1.9.5.msysgit.0 >> > > _______________________________________________ > dev mailing list > dev@openvswitch.org > http://openvswitch.org/mailman/listinfo/dev
diff --git a/datapath-windows/ovsext/Actions.c b/datapath-windows/ovsext/Actions.c index b4644a7..ce592b3 100644 --- a/datapath-windows/ovsext/Actions.c +++ b/datapath-windows/ovsext/Actions.c @@ -594,7 +594,7 @@ OvsDoFlowLookupOutput(OvsForwardingContext *ovsFwdCtx) InitializeListHead(&missedPackets); status = OvsCreateAndAddPackets(NULL, 0, OVS_PACKET_CMD_MISS, vport, &key,ovsFwdCtx->curNbl, - ovsFwdCtx->tunnelRxNic != NULL, &ovsFwdCtx->layers, + FALSE, &ovsFwdCtx->layers, ovsFwdCtx->switchContext, &missedPackets, &num); if (num) { OvsQueuePackets(&missedPackets, num); @@ -709,6 +709,7 @@ OvsTunnelPortRx(OvsForwardingContext *ovsFwdCtx) NDIS_STATUS status = NDIS_STATUS_SUCCESS; PNET_BUFFER_LIST newNbl = NULL; POVS_VPORT_ENTRY tunnelRxVport = ovsFwdCtx->tunnelRxNic; + PCWSTR dropReason = L"OVS-dropped due to new decap packet"; if (OvsValidateIPChecksum(ovsFwdCtx->curNbl, &ovsFwdCtx->layers) != NDIS_STATUS_SUCCESS) { @@ -730,6 +731,10 @@ OvsTunnelPortRx(OvsForwardingContext *ovsFwdCtx) case OVS_VPORT_TYPE_STT: status = OvsDecapStt(ovsFwdCtx->switchContext, ovsFwdCtx->curNbl, &ovsFwdCtx->tunKey, &newNbl); + if (status == NDIS_STATUS_SUCCESS && newNbl == NULL) { + /* This was an STT-LSO Fragment */ + dropReason = L"OVS-STT segment is cached"; + } break; default: OVS_LOG_ERROR("Rx: Unhandled tunnel type: %d\n", @@ -747,25 +752,26 @@ OvsTunnelPortRx(OvsForwardingContext *ovsFwdCtx) * tunnelRxNic and other fields will be cleared, re-init the context * before usage. 
*/ - OvsCompleteNBLForwardingCtx(ovsFwdCtx, - L"OVS-dropped due to new decap packet"); + OvsCompleteNBLForwardingCtx(ovsFwdCtx, dropReason); - /* Decapsulated packet is in a new NBL */ - ovsFwdCtx->tunnelRxNic = tunnelRxVport; - OvsInitForwardingCtx(ovsFwdCtx, ovsFwdCtx->switchContext, - newNbl, tunnelRxVport->portNo, 0, - NET_BUFFER_LIST_SWITCH_FORWARDING_DETAIL(newNbl), - ovsFwdCtx->completionList, - &ovsFwdCtx->layers, FALSE); + if (newNbl) { + /* Decapsulated packet is in a new NBL */ + ovsFwdCtx->tunnelRxNic = tunnelRxVport; + OvsInitForwardingCtx(ovsFwdCtx, ovsFwdCtx->switchContext, + newNbl, tunnelRxVport->portNo, 0, + NET_BUFFER_LIST_SWITCH_FORWARDING_DETAIL(newNbl), + ovsFwdCtx->completionList, + &ovsFwdCtx->layers, FALSE); - /* - * Set the NBL's SourcePortId and SourceNicIndex to default values to - * keep NDIS happy when we forward the packet. - */ - ovsFwdCtx->fwdDetail->SourcePortId = NDIS_SWITCH_DEFAULT_PORT_ID; - ovsFwdCtx->fwdDetail->SourceNicIndex = 0; + /* + * Set the NBL's SourcePortId and SourceNicIndex to default values to + * keep NDIS happy when we forward the packet. 
+ */ + ovsFwdCtx->fwdDetail->SourcePortId = NDIS_SWITCH_DEFAULT_PORT_ID; + ovsFwdCtx->fwdDetail->SourceNicIndex = 0; - status = OvsDoFlowLookupOutput(ovsFwdCtx); + status = OvsDoFlowLookupOutput(ovsFwdCtx); + } ASSERT(ovsFwdCtx->curNbl == NULL); OvsClearTunRxCtx(ovsFwdCtx); diff --git a/datapath-windows/ovsext/Stt.c b/datapath-windows/ovsext/Stt.c index b78ef95..ef44d23 100644 --- a/datapath-windows/ovsext/Stt.c +++ b/datapath-windows/ovsext/Stt.c @@ -34,6 +34,7 @@ #endif #define OVS_DBG_MOD OVS_DBG_STT #include "Debug.h" +#include "Jhash.h" KSTART_ROUTINE OvsSttDefragCleaner; static PLIST_ENTRY OvsSttPktFragHash; @@ -152,8 +153,8 @@ OvsDoEncapStt(POVS_VPORT_ENTRY vport, UINT32 headRoom = OvsGetSttTunHdrSize(); UINT32 tcpChksumLen; PUINT8 bufferStart; - - UNREFERENCED_PARAMETER(layers); + ULONG mss = 0; + NDIS_TCP_LARGE_SEND_OFFLOAD_NET_BUFFER_LIST_INFO lsoInfo; curNb = NET_BUFFER_LIST_FIRST_NB(curNbl); @@ -162,14 +163,20 @@ OvsDoEncapStt(POVS_VPORT_ENTRY vport, BOOLEAN innerPartialChecksum = FALSE; if (layers->isTcp) { - NDIS_TCP_LARGE_SEND_OFFLOAD_NET_BUFFER_LIST_INFO lsoInfo; - lsoInfo.Value = NET_BUFFER_LIST_INFO(curNbl, TcpLargeSendNetBufferListInfo); - if (lsoInfo.LsoV1Transmit.MSS) { - /* XXX We don't handle LSO yet */ - OVS_LOG_ERROR("LSO on STT is not supported"); - return NDIS_STATUS_FAILURE; + + switch (lsoInfo.Transmit.Type) { + case NDIS_TCP_LARGE_SEND_OFFLOAD_V1_TYPE: + mss = lsoInfo.LsoV1Transmit.MSS; + break; + case NDIS_TCP_LARGE_SEND_OFFLOAD_V2_TYPE: + mss = lsoInfo.LsoV2Transmit.MSS; + break; + default: + OVS_LOG_ERROR("Unknown LSO transmit type:%d", + lsoInfo.Transmit.Type); + return NDIS_STATUS_FAILURE; } } @@ -186,21 +193,36 @@ OvsDoEncapStt(POVS_VPORT_ENTRY vport, return NDIS_STATUS_FAILURE; } - curNb = NET_BUFFER_LIST_FIRST_NB(*newNbl); + curNbl = *newNbl; + curNb = NET_BUFFER_LIST_FIRST_NB(curNbl); curMdl = NET_BUFFER_CURRENT_MDL(curNb); + /* NB Chain should be split before */ + ASSERT(NET_BUFFER_NEXT_NB(curNb) == NULL); + innerFrameLen = 
NET_BUFFER_DATA_LENGTH(curNb); + bufferStart = (PUINT8)MmGetSystemAddressForMdlSafe(curMdl, LowPagePriority); bufferStart += NET_BUFFER_CURRENT_MDL_OFFSET(curNb); - if (layers->isIPv4 && csumInfo.Transmit.IpHeaderChecksum) { + if (layers->isIPv4) { IPHdr *ip = (IPHdr *)(bufferStart + layers->l3Offset); - ip->check = IPChecksum((UINT8 *)ip, ip->ihl * 4, 0); + if (!ip->tot_len) { + ip->tot_len = htons(innerFrameLen - sizeof(EthHdr)); + } + if (!ip->check) { + ip->check = IPChecksum((UINT8 *)ip, ip->ihl * 4, 0); + } } + if (layers->isTcp) { - if(!csumInfo.Transmit.TcpChecksum) { - innerChecksumVerified = TRUE; - } else { + if (mss) { innerPartialChecksum = TRUE; + } else { + if (!csumInfo.Transmit.TcpChecksum) { + innerChecksumVerified = TRUE; + } else { + innerPartialChecksum = TRUE; + } } } else if (layers->isUdp) { if(!csumInfo.Transmit.UdpChecksum) { @@ -210,24 +232,6 @@ OvsDoEncapStt(POVS_VPORT_ENTRY vport, } } - curNbl = *newNbl; - curNb = NET_BUFFER_LIST_FIRST_NB(curNbl); - /* NB Chain should be split before */ - ASSERT(NET_BUFFER_NEXT_NB(curNb) == NULL); - - innerFrameLen = NET_BUFFER_DATA_LENGTH(curNb); - /* - * External port can't be removed as we hold the dispatch lock - * We also check if the external port was removed beforecalling - * port encapsulation functions - */ - if (innerFrameLen > OvsGetExternalMtu(switchContext) - headRoom) { - OVS_LOG_ERROR("Packet too large (size %d, mtu %d). 
Can't encapsulate", - innerFrameLen, OvsGetExternalMtu(switchContext)); - status = NDIS_STATUS_FAILURE; - goto ret_error; - } - status = NdisRetreatNetBufferDataStart(curNb, headRoom, 0, NULL); if (status != NDIS_STATUS_SUCCESS) { ASSERT(!"Unable to NdisRetreatNetBufferDataStart(headroom)"); @@ -301,33 +305,52 @@ OvsDoEncapStt(POVS_VPORT_ENTRY vport, IPPROTO_TCP, (uint16) tcpChksumLen); sttHdr->version = 0; - /* XXX need to peek into the inner packet, hard code for now */ - sttHdr->flags = STT_PROTO_IPV4; - if (innerChecksumVerified) { - sttHdr->flags |= STT_CSUM_VERIFIED; - } else if (innerPartialChecksum) { + /* Set STT Header */ + sttHdr->flags = 0; + if (innerPartialChecksum) { sttHdr->flags |= STT_CSUM_PARTIAL; + if (layers->isIPv4) { + sttHdr->flags |= STT_PROTO_IPV4; + } + if (layers->isTcp) { + sttHdr->flags |= STT_PROTO_TCP; + } + sttHdr->l4Offset = (UINT8) layers->l4Offset; + sttHdr->mss = (UINT16) htons(mss); + } else if (innerChecksumVerified) { + sttHdr->flags = STT_CSUM_VERIFIED; + sttHdr->l4Offset = 0; + sttHdr->mss = 0; } - sttHdr->l4Offset = 0; sttHdr->reserved = 0; - /* XXX Used for large TCP packets.Not sure how it is used, clarify */ - sttHdr->mss = 0; sttHdr->vlanTCI = 0; sttHdr->key = tunKey->tunnelId; /* Zero out stt padding */ *(uint16 *)(sttHdr + 1) = 0; /* Offload IP and TCP checksum */ + ULONG tcpHeaderOffset = sizeof *outerEthHdr + + outerIpHdr->ihl * 4; csumInfo.Value = 0; csumInfo.Transmit.IpHeaderChecksum = 1; csumInfo.Transmit.TcpChecksum = 1; csumInfo.Transmit.IsIPv4 = 1; - csumInfo.Transmit.TcpHeaderOffset = sizeof *outerEthHdr + - outerIpHdr->ihl * 4; + csumInfo.Transmit.TcpHeaderOffset = tcpHeaderOffset; NET_BUFFER_LIST_INFO(curNbl, TcpIpChecksumNetBufferListInfo) = csumInfo.Value; + UINT32 encapMss = OvsGetExternalMtu(switchContext) - sizeof(IPHdr) - sizeof(TCPHdr); + if (ipTotalLen > encapMss) { + lsoInfo.Value = 0; + lsoInfo.LsoV2Transmit.TcpHeaderOffset = tcpHeaderOffset; + lsoInfo.LsoV2Transmit.MSS = encapMss; + 
lsoInfo.LsoV2Transmit.Type = NDIS_TCP_LARGE_SEND_OFFLOAD_V2_TYPE; + lsoInfo.LsoV2Transmit.IPVersion = NDIS_TCP_LARGE_SEND_OFFLOAD_IPv4; + NET_BUFFER_LIST_INFO(curNbl, + TcpLargeSendNetBufferListInfo) = lsoInfo.Value; + } + return STATUS_SUCCESS; ret_error: @@ -338,16 +361,22 @@ ret_error: /* *---------------------------------------------------------------------------- - * OvsCalculateTCPChecksum - * Calculate TCP checksum + * OvsValidateTCPChecksum + * Validate TCP checksum *---------------------------------------------------------------------------- */ static __inline NDIS_STATUS -OvsCalculateTCPChecksum(PNET_BUFFER_LIST curNbl, PNET_BUFFER curNb) +OvsValidateTCPChecksum(PNET_BUFFER_LIST curNbl, PNET_BUFFER curNb) { NDIS_TCP_IP_CHECKSUM_NET_BUFFER_LIST_INFO csumInfo; csumInfo.Value = NET_BUFFER_LIST_INFO(curNbl, TcpIpChecksumNetBufferListInfo); + + /* Check if NIC has indicated TCP checksum failure */ + if (csumInfo.Receive.TcpChecksumFailed) { + return NDIS_STATUS_INVALID_PACKET; + } + UINT16 checkSum; /* Check if TCP Checksum has been calculated by NIC */ @@ -399,10 +428,9 @@ OvsInitSttDefragmentation() NdisAllocateSpinLock(&OvsSttSpinLock); /* Init the Hash Buffer */ - OvsSttPktFragHash = (PLIST_ENTRY) OvsAllocateMemoryWithTag( - sizeof(LIST_ENTRY) - * STT_HASH_TABLE_SIZE, - OVS_STT_POOL_TAG); + OvsSttPktFragHash = OvsAllocateMemoryWithTag(sizeof(LIST_ENTRY) + * STT_HASH_TABLE_SIZE, + OVS_STT_POOL_TAG); if (OvsSttPktFragHash == NULL) { NdisFreeSpinLock(&OvsSttSpinLock); return STATUS_INSUFFICIENT_RESOURCES; @@ -487,6 +515,7 @@ OvsSttDefragCleaner(PVOID data) entry = CONTAINING_RECORD(link, OVS_STT_PKT_ENTRY, link); if (entry->timeout < currentTime) { RemoveEntryList(&entry->link); + OvsFreeMemoryWithTag(entry->packetBuf, OVS_STT_POOL_TAG); OvsFreeMemoryWithTag(entry, OVS_STT_POOL_TAG); } } @@ -500,6 +529,158 @@ OvsSttDefragCleaner(PVOID data) PsTerminateSystemThread(STATUS_SUCCESS); } +static OVS_STT_PKT_KEY +OvsGeneratePacketKey(IPHdr *ipHdr, TCPHdr *tcpHdr) 
+{ + OVS_STT_PKT_KEY key; + key.sAddr = ipHdr->saddr; + key.dAddr = ipHdr->daddr; + key.ackSeq = ntohl(tcpHdr->ack_seq); + return key; +} + +static UINT32 +OvsSttGetPktHash(OVS_STT_PKT_KEY *pktKey) +{ + UINT32 arr[3]; + arr[0] = pktKey->ackSeq; + arr[1] = pktKey->dAddr; + arr[2] = pktKey->sAddr; + return OvsJhashWords(arr, 3, OVS_HASH_BASIS); +} + +static VOID * +OvsLookupPktFrag(OVS_STT_PKT_KEY *pktKey, UINT32 hash) +{ + PLIST_ENTRY link; + POVS_STT_PKT_ENTRY entry; + + LIST_FORALL(&OvsSttPktFragHash[hash & STT_HASH_TABLE_MASK], link) { + entry = CONTAINING_RECORD(link, OVS_STT_PKT_ENTRY, link); + if (entry->ovsPktKey.ackSeq == pktKey->ackSeq && + entry->ovsPktKey.dAddr == pktKey->dAddr && + entry->ovsPktKey.sAddr == pktKey->sAddr) { + return entry; + } + } + return NULL; +} + +/* +* +-------------------------------------------------------------------------- +* OvsSttReassemble -- +* Reassemble an LSO packet from multiple STT-Fragments. +* +-------------------------------------------------------------------------- +*/ +PNET_BUFFER_LIST +OvsSttReassemble(POVS_SWITCH_CONTEXT switchContext, + PNET_BUFFER_LIST curNbl, + IPHdr *ipHdr, + TCPHdr *tcp, + SttHdr *newSttHdr, + UINT16 payloadLen) +{ + UINT32 seq = ntohl(tcp->seq); + UINT32 innerPacketLen = (seq >> STT_SEQ_LEN_SHIFT) - STT_HDR_LEN; + UINT32 segOffset = STT_SEGMENT_OFF(seq); + UINT32 offset = segOffset == 0 ? 
0 : segOffset - STT_HDR_LEN; + UINT32 startOffset = 0; + OVS_STT_PKT_ENTRY *pktFragEntry; + PNET_BUFFER_LIST targetPNbl = NULL; + BOOLEAN lastPacket = FALSE; + PNET_BUFFER sourceNb; + UINT32 fragmentLength = payloadLen; + SttHdr stt; + SttHdr *sttHdr = NULL; + sourceNb = NET_BUFFER_LIST_FIRST_NB(curNbl); + + /* XXX optimize this lock */ + NdisAcquireSpinLock(&OvsSttSpinLock); + + /* If this is the first fragment, copy the STT header */ + if (segOffset == 0) { + sttHdr = NdisGetDataBuffer(sourceNb, sizeof(SttHdr), &stt, 1, 0); + if (sttHdr == NULL) { + OVS_LOG_ERROR("Unable to retrieve STT header"); + return NULL; + } + fragmentLength = fragmentLength - STT_HDR_LEN; + startOffset = startOffset + STT_HDR_LEN; + } + + /* Lookup fragment */ + OVS_STT_PKT_KEY pktKey = OvsGeneratePacketKey(ipHdr, tcp); + UINT32 hash = OvsSttGetPktHash(&pktKey); + pktFragEntry = OvsLookupPktFrag(&pktKey, hash); + + if (pktFragEntry == NULL) { + /* Create a new Packet Entry */ + POVS_STT_PKT_ENTRY entry; + entry = OvsAllocateMemoryWithTag(sizeof(OVS_STT_PKT_ENTRY), + OVS_STT_POOL_TAG); + RtlZeroMemory(entry, sizeof (OVS_STT_PKT_ENTRY)); + + /* Update Key, timestamp and recvdLen */ + NdisMoveMemory(&entry->ovsPktKey, &pktKey, sizeof (OVS_STT_PKT_KEY)); + + entry->recvdLen = fragmentLength; + + UINT64 currentTime; + NdisGetCurrentSystemTime((LARGE_INTEGER *) ¤tTime); + entry->timeout = currentTime + STT_ENTRY_TIMEOUT; + + if (segOffset == 0) { + entry->sttHdr = *sttHdr; + } + + /* Copy the data from Source to new buffer */ + entry->packetBuf = OvsAllocateMemoryWithTag(innerPacketLen, + OVS_STT_POOL_TAG); + if (OvsGetPacketBytes(curNbl, fragmentLength, startOffset, + entry->packetBuf + offset) == NULL) { + OVS_LOG_ERROR("Error when obtaining bytes from Packet"); + goto handle_error; + } + + /* Insert the entry in the Static Buffer */ + InsertHeadList(&OvsSttPktFragHash[hash & STT_HASH_TABLE_MASK], + &entry->link); + } else { + /* Add to recieved length to identify if this is the last fragment 
*/ + pktFragEntry->recvdLen += fragmentLength; + lastPacket = (pktFragEntry->recvdLen == innerPacketLen); + + if (segOffset == 0) { + pktFragEntry->sttHdr = *sttHdr; + } + + /* Copy the fragment data from Source to existing buffer */ + if (OvsGetPacketBytes(curNbl, fragmentLength, startOffset, + pktFragEntry->packetBuf + offset) == NULL) { + OVS_LOG_ERROR("Error when obtaining bytes from Packet"); + goto handle_error; + } + } + +handle_error: + if (lastPacket) { + /* Retrieve the original STT header */ + NdisMoveMemory(newSttHdr, &pktFragEntry->sttHdr, sizeof (SttHdr)); + targetPNbl = OvsAllocateNBLFromBuffer(switchContext, pktFragEntry->packetBuf, + innerPacketLen); + + /* Delete this entry and free up the memory/ */ + RemoveEntryList(&pktFragEntry->link); + OvsFreeMemoryWithTag(pktFragEntry->packetBuf, OVS_STT_POOL_TAG); + OvsFreeMemoryWithTag(pktFragEntry, OVS_STT_POOL_TAG); + } + + NdisReleaseSpinLock(&OvsSttSpinLock); + return lastPacket ? targetPNbl : NULL; +} + /* * -------------------------------------------------------------------------- * OvsDecapStt -- @@ -513,34 +694,20 @@ OvsDecapStt(POVS_SWITCH_CONTEXT switchContext, PNET_BUFFER_LIST *newNbl) { NDIS_STATUS status = NDIS_STATUS_FAILURE; - PNET_BUFFER curNb; + PNET_BUFFER curNb, newNb; IPHdr *ipHdr; char *ipBuf[sizeof(IPHdr)]; + SttHdr stt; SttHdr *sttHdr; char *sttBuf[STT_HDR_LEN]; UINT32 advanceCnt, hdrLen; - NDIS_TCP_IP_CHECKSUM_NET_BUFFER_LIST_INFO csumInfo; + BOOLEAN isLsoPacket = FALSE; curNb = NET_BUFFER_LIST_FIRST_NB(curNbl); ASSERT(NET_BUFFER_NEXT_NB(curNb) == NULL); - if (NET_BUFFER_DATA_LENGTH(curNb) < OvsGetSttTunHdrSize()) { - OVS_LOG_ERROR("Packet length received is less than the tunnel header:" - " %d<%d\n", NET_BUFFER_DATA_LENGTH(curNb), OvsGetSttTunHdrSize()); - return NDIS_STATUS_INVALID_LENGTH; - } - - /* Verify outer TCP Checksum */ - csumInfo.Value = NET_BUFFER_LIST_INFO(curNbl, - TcpIpChecksumNetBufferListInfo); - - /* Check if NIC has indicated TCP checksum failure */ - if 
(csumInfo.Receive.TcpChecksumFailed) { - return NDIS_STATUS_INVALID_PACKET; - } - - /* Calculate the TCP Checksum */ - status = OvsCalculateTCPChecksum(curNbl, curNb); + /* Validate the TCP Checksum */ + status = OvsValidateTCPChecksum(curNbl, curNb); if (status != NDIS_STATUS_SUCCESS) { return status; } @@ -554,34 +721,73 @@ OvsDecapStt(POVS_SWITCH_CONTEXT switchContext, 1 /*no align*/, 0); ASSERT(ipHdr); + TCPHdr *tcp = (TCPHdr *)((PCHAR)ipHdr + ipHdr->ihl * 4); + /* Skip IP & TCP headers */ hdrLen = sizeof(IPHdr) + sizeof(TCPHdr), NdisAdvanceNetBufferDataStart(curNb, hdrLen, FALSE, NULL); advanceCnt += hdrLen; - /* STT Header */ - sttHdr = NdisGetDataBuffer(curNb, sizeof *sttHdr, (PVOID) &sttBuf, - 1 /*no align*/, 0); + UINT32 seq = ntohl(tcp->seq); + UINT32 totalLen = (seq >> STT_SEQ_LEN_SHIFT); + UINT16 payloadLen = (UINT16)ntohs(ipHdr->tot_len) + - (ipHdr->ihl * 4) + - (sizeof * tcp); + + /* Check if incoming packet requires reassembly */ + if (totalLen != payloadLen) { + sttHdr = &stt; + PNET_BUFFER_LIST pNbl = OvsSttReassemble(switchContext, curNbl, + ipHdr, tcp, sttHdr, + payloadLen); + if (pNbl == NULL) { + return NDIS_STATUS_SUCCESS; + } + + *newNbl = pNbl; + isLsoPacket = TRUE; + } else { + /* STT Header */ + sttHdr = NdisGetDataBuffer(curNb, sizeof *sttHdr, + (PVOID) &sttBuf, 1 /*no align*/, 0); + /* Skip stt header, DataOffset points to inner pkt now. 
*/ + hdrLen = STT_HDR_LEN; + NdisAdvanceNetBufferDataStart(curNb, hdrLen, FALSE, NULL); + advanceCnt += hdrLen; + + *newNbl = OvsPartialCopyNBL(switchContext, curNbl, 0, + 0, FALSE /*copy NBL info*/); + } + + if (*newNbl == NULL) { + OVS_LOG_ERROR("Unable to allocate a new cloned NBL"); + return NDIS_STATUS_RESOURCES; + } + + status = NdisRetreatNetBufferDataStart(curNb, advanceCnt, 0, NULL); + if (status != NDIS_STATUS_SUCCESS) { + OvsCompleteNBL(switchContext, *newNbl, TRUE); + return NDIS_STATUS_FAILURE; + } + newNb = NET_BUFFER_LIST_FIRST_NB(*newNbl); + ASSERT(sttHdr); /* Initialize the tunnel key */ tunKey->dst = ipHdr->daddr; tunKey->src = ipHdr->saddr; tunKey->tunnelId = sttHdr->key; - tunKey->flags = (OVS_TNL_F_CSUM | OVS_TNL_F_KEY); + tunKey->flags = OVS_TNL_F_KEY; tunKey->tos = ipHdr->tos; tunKey->ttl = ipHdr->ttl; tunKey->pad = 0; - /* Skip stt header, DataOffset points to inner pkt now. */ - hdrLen = STT_HDR_LEN; - NdisAdvanceNetBufferDataStart(curNb, hdrLen, FALSE, NULL); - advanceCnt += hdrLen; + BOOLEAN requiresLSO = sttHdr->mss != 0; /* Verify checksum for inner packet if it's required */ if (!(sttHdr->flags & STT_CSUM_VERIFIED)) { BOOLEAN innerChecksumPartial = sttHdr->flags & STT_CSUM_PARTIAL; - EthHdr *eth = (EthHdr *)NdisGetDataBuffer(curNb, sizeof(EthHdr), + EthHdr *eth = (EthHdr *)NdisGetDataBuffer(newNb, sizeof(EthHdr), NULL, 1, 0); /* XXX Figure out a way to offload checksum receives */ @@ -597,14 +803,16 @@ OvsDecapStt(POVS_SWITCH_CONTEXT switchContext, IPPROTO_TCP, (UINT16)l4Payload); } - tcp->check = CalculateChecksumNB(curNb, l4Payload, offset); + if (!requiresLSO) { + tcp->check = CalculateChecksumNB(newNb, l4Payload, offset); + } } else if (ip->protocol == IPPROTO_UDP) { UDPHdr *udp = (UDPHdr *)((PCHAR)ip + sizeof *ip); if (!innerChecksumPartial){ udp->check = IPPseudoChecksum(&ip->saddr, &ip->daddr, IPPROTO_UDP, l4Payload); } - udp->check = CalculateChecksumNB(curNb, l4Payload, offset); + udp->check = CalculateChecksumNB(newNb, 
l4Payload, offset); } } else if (eth->Type == ntohs(NDIS_ETH_TYPE_IPV6)) { IPv6Hdr *ip = (IPv6Hdr *)((PCHAR)eth + sizeof *eth); @@ -617,7 +825,9 @@ OvsDecapStt(POVS_SWITCH_CONTEXT switchContext, (UINT32 *)&ip->daddr, IPPROTO_TCP, totalLength); } - tcp->check = CalculateChecksumNB(curNb, totalLength, offset); + if (!requiresLSO) { + tcp->check = CalculateChecksumNB(newNb, totalLength, offset); + } } else if (ip->nexthdr == IPPROTO_UDP) { UDPHdr *udp = (UDPHdr *)((PCHAR)ip + sizeof *ip); @@ -626,23 +836,27 @@ OvsDecapStt(POVS_SWITCH_CONTEXT switchContext, (UINT32 *)&ip->daddr, IPPROTO_UDP, totalLength); } - udp->check = CalculateChecksumNB(curNb, totalLength, offset); + udp->check = CalculateChecksumNB(newNb, totalLength, offset); } } - NET_BUFFER_LIST_INFO(curNbl, TcpIpChecksumNetBufferListInfo) = 0; + NET_BUFFER_LIST_INFO(*newNbl, TcpIpChecksumNetBufferListInfo) = 0; } - *newNbl = OvsPartialCopyNBL(switchContext, curNbl, OVS_DEFAULT_COPY_SIZE, - 0, FALSE /*copy NBL info*/); - - ASSERT(advanceCnt == OvsGetSttTunHdrSize()); - status = NdisRetreatNetBufferDataStart(curNb, advanceCnt, 0, NULL); - - if (*newNbl == NULL) { - OVS_LOG_ERROR("OvsDecapStt: Unable to allocate a new cloned NBL"); - status = NDIS_STATUS_RESOURCES; + if (requiresLSO) { + NDIS_TCP_LARGE_SEND_OFFLOAD_NET_BUFFER_LIST_INFO lsoInfo; + lsoInfo.Value = 0; + lsoInfo.LsoV2Transmit.TcpHeaderOffset = sttHdr->l4Offset; + lsoInfo.LsoV2Transmit.MSS = ETH_DEFAULT_MTU - sizeof(IPHdr) - sizeof(TCPHdr); + lsoInfo.LsoV2Transmit.Type = NDIS_TCP_LARGE_SEND_OFFLOAD_V2_TYPE; + if (sttHdr->flags & STT_PROTO_IPV4) { + lsoInfo.LsoV2Transmit.IPVersion = NDIS_TCP_LARGE_SEND_OFFLOAD_IPv4; + } else { + lsoInfo.LsoV2Transmit.IPVersion = NDIS_TCP_LARGE_SEND_OFFLOAD_IPv6; + } + NET_BUFFER_LIST_INFO(*newNbl, + TcpLargeSendNetBufferListInfo) = lsoInfo.Value; } - return status; + return NDIS_STATUS_SUCCESS; }
Add support in the STT encap and decap functions for reassembling packet fragments. Also add support for offloading the packet to NDIS. Signed-off-by: Sairam Venugopal <vsairam@vmware.com> --- datapath-windows/ovsext/Actions.c | 40 ++-- datapath-windows/ovsext/Stt.c | 398 +++++++++++++++++++++++++++++--------- 2 files changed, 329 insertions(+), 109 deletions(-)