Message ID | 20180618053729.2372-3-kumaranand@vmware.com |
---|---|
State | Superseded |
Headers | show |
Series | Optimize conntrack performance | expand |
On 06/17/2018 10:37 PM, Anand Kumar wrote: > This patch primarily gets rid of NdisRWLock in conntrack for NAT > functionality along with some conntrack optimization. The subsequent > patch will have a lock implementation inside NAT module. > > - Introduce a new function OvsGetTcpHeader() to retrieve TCP header > and payload length, to optimize for TCP traffic. > - Optimize conntrack look up. > - Remove 'bucketlockRef' member from conntrack entry structure. > > Signed-off-by: Anand Kumar <kumaranand@vmware.com> > --- > datapath-windows/ovsext/Conntrack-ftp.c | 4 +- > datapath-windows/ovsext/Conntrack-tcp.c | 15 ++--- > datapath-windows/ovsext/Conntrack.c | 110 +++++++++++++------------------- > datapath-windows/ovsext/Conntrack.h | 36 +++++++---- > 4 files changed, 74 insertions(+), 91 deletions(-) > > diff --git a/datapath-windows/ovsext/Conntrack-ftp.c b/datapath-windows/ovsext/Conntrack-ftp.c > index 6830dfa..ce09a65 100644 > --- a/datapath-windows/ovsext/Conntrack-ftp.c > +++ b/datapath-windows/ovsext/Conntrack-ftp.c > @@ -129,14 +129,14 @@ OvsCtHandleFtp(PNET_BUFFER_LIST curNbl, > char temp[256] = { 0 }; > char ftpMsg[256] = { 0 }; > > + UINT32 len; > TCPHdr tcpStorage; > const TCPHdr *tcp; > - tcp = OvsGetTcp(curNbl, layers->l4Offset, &tcpStorage); > + tcp = OvsGetTcpHeader(curNbl, layers, &tcpStorage, &len); > if (!tcp) { > return NDIS_STATUS_INVALID_PACKET; > } > > - UINT32 len = OvsGetTcpPayloadLength(curNbl); > if (len > sizeof(temp)) { > /* We only care up to 256 */ > len = sizeof(temp); > diff --git a/datapath-windows/ovsext/Conntrack-tcp.c b/datapath-windows/ovsext/Conntrack-tcp.c > index 8cbab24..eda42ac 100644 > --- a/datapath-windows/ovsext/Conntrack-tcp.c > +++ b/datapath-windows/ovsext/Conntrack-tcp.c > @@ -194,9 +194,9 @@ OvsCastConntrackEntryToTcpEntry(OVS_CT_ENTRY* conn) > enum CT_UPDATE_RES > OvsConntrackUpdateTcpEntry(OVS_CT_ENTRY* conn_, > const TCPHdr *tcp, > - PNET_BUFFER_LIST nbl, > BOOLEAN reply, > - UINT64 now) > + UINT64 now, > + UINT32 tcpPayloadLen) > { > struct conn_tcp *conn = OvsCastConntrackEntryToTcpEntry(conn_); > /* The peer that sent 'pkt' */ > @@ -207,7 +207,6 @@ OvsConntrackUpdateTcpEntry(OVS_CT_ENTRY* conn_, > UINT16 tcp_flags = ntohs(tcp->flags); > uint16_t win = ntohs(tcp->window); > uint32_t ack, end, seq, orig_seq; > - uint32_t p_len = OvsGetTcpPayloadLength(nbl); > int ackskew; > > if (OvsCtInvalidTcpFlags(tcp_flags)) { > @@ -248,7 +247,7 @@ OvsConntrackUpdateTcpEntry(OVS_CT_ENTRY* conn_, > > ack = ntohl(tcp->ack_seq); > > - end = seq + p_len; > + end = seq + tcpPayloadLen; > if (tcp_flags & TCP_SYN) { > end++; > if (dst->wscale & CT_WSCALE_FLAG) { > @@ -287,7 +286,7 @@ OvsConntrackUpdateTcpEntry(OVS_CT_ENTRY* conn_, > > } else { > ack = ntohl(tcp->ack_seq); > - end = seq + p_len; > + end = seq + tcpPayloadLen; > if (tcp_flags & TCP_SYN) { > end++; > } > @@ -469,8 +468,8 @@ OvsConntrackValidateTcpPacket(const TCPHdr *tcp) > > OVS_CT_ENTRY * > OvsConntrackCreateTcpEntry(const TCPHdr *tcp, > - PNET_BUFFER_LIST nbl, > - UINT64 now) > + UINT64 now, > + UINT32 tcpPayloadLen) > { > struct conn_tcp* newconn; > struct tcp_peer *src, *dst; > @@ -486,7 +485,7 @@ OvsConntrackCreateTcpEntry(const TCPHdr *tcp, > dst = &newconn->peer[1]; > > src->seqlo = ntohl(tcp->seq); > - src->seqhi = src->seqlo + OvsGetTcpPayloadLength(nbl) + 1; > + src->seqhi = src->seqlo + tcpPayloadLen + 1; > > if (tcp->flags & TCP_SYN) { > src->seqhi++; > diff --git a/datapath-windows/ovsext/Conntrack.c b/datapath-windows/ovsext/Conntrack.c > index 7b54fba..2a85e57 100644 > --- a/datapath-windows/ovsext/Conntrack.c > +++ b/datapath-windows/ovsext/Conntrack.c > @@ -32,7 +32,6 @@ KSTART_ROUTINE OvsConntrackEntryCleaner; > static PLIST_ENTRY ovsConntrackTable; > static OVS_CT_THREAD_CTX ctThreadCtx; > static PNDIS_RW_LOCK_EX *ovsCtBucketLock = NULL; > -static PNDIS_RW_LOCK_EX ovsCtNatLockObj; > extern POVS_SWITCH_CONTEXT gOvsSwitchContext; > static ULONG ctTotalEntries; > > @@ -54,19 +53,11 @@ OvsInitConntrack(POVS_SWITCH_CONTEXT context) > ctTotalEntries = 0; > UINT32 numBucketLocks = CT_HASH_TABLE_SIZE; > > - /* Init the sync-lock */ > - ovsCtNatLockObj = NdisAllocateRWLock(context->NdisFilterHandle); > - if (ovsCtNatLockObj == NULL) { > - return STATUS_INSUFFICIENT_RESOURCES; > - } > - > /* Init the Hash Buffer */ > ovsConntrackTable = OvsAllocateMemoryWithTag(sizeof(LIST_ENTRY) > * CT_HASH_TABLE_SIZE, > OVS_CT_POOL_TAG); > if (ovsConntrackTable == NULL) { > - NdisFreeRWLock(ovsCtNatLockObj); > - ovsCtNatLockObj = NULL; > return STATUS_INSUFFICIENT_RESOURCES; > } > > @@ -121,8 +112,6 @@ freeBucketLock: > freeTable: > OvsFreeMemoryWithTag(ovsConntrackTable, OVS_CT_POOL_TAG); > ovsConntrackTable = NULL; > - NdisFreeRWLock(ovsCtNatLockObj); > - ovsCtNatLockObj = NULL; > return status; > } > > @@ -135,7 +124,6 @@ freeTable: > VOID > OvsCleanupConntrack(VOID) > { > - LOCK_STATE_EX lockStateNat; > ctThreadCtx.exit = 1; > KeSetEvent(&ctThreadCtx.event, 0, FALSE); > KeWaitForSingleObject(ctThreadCtx.threadObject, Executive, > @@ -160,12 +148,7 @@ OvsCleanupConntrack(VOID) > } > OvsFreeMemoryWithTag(ovsCtBucketLock, OVS_CT_POOL_TAG); > ovsCtBucketLock = NULL; > - > - NdisAcquireRWLockWrite(ovsCtNatLockObj, &lockStateNat, 0); > OvsNatCleanup(); > - NdisReleaseRWLock(ovsCtNatLockObj, &lockStateNat); > - NdisFreeRWLock(ovsCtNatLockObj); > - ovsCtNatLockObj = NULL; > } > > static __inline VOID > @@ -244,25 +227,20 @@ OvsCtAddEntry(POVS_CT_ENTRY entry, > if (natInfo == NULL) { > entry->natInfo.natAction = NAT_ACTION_NONE; > } else { > - LOCK_STATE_EX lockStateNat; > - NdisAcquireRWLockWrite(ovsCtNatLockObj, &lockStateNat, 0); > if (OvsIsForwardNat(natInfo->natAction)) { > entry->natInfo = *natInfo; > if (!OvsNatTranslateCtEntry(entry)) { > - NdisReleaseRWLock(ovsCtNatLockObj, &lockStateNat); > return FALSE; > } > ctx->hash = OvsHashCtKey(&entry->key); > } else { > entry->natInfo.natAction = natInfo->natAction; > } > - NdisReleaseRWLock(ovsCtNatLockObj, &lockStateNat); > } > > entry->timestampStart = now; > NdisAllocateSpinLock(&(entry->lock)); > UINT32 bucketIdx = ctx->hash & CT_HASH_TABLE_MASK; > - entry->bucketLockRef = ovsCtBucketLock[bucketIdx]; > NdisAcquireRWLockWrite(ovsCtBucketLock[bucketIdx], &lockState, 0); > InsertHeadList(&ovsConntrackTable[bucketIdx], > &entry->link); > @@ -275,7 +253,7 @@ OvsCtAddEntry(POVS_CT_ENTRY entry, > static __inline POVS_CT_ENTRY > OvsCtEntryCreate(OvsForwardingContext *fwdCtx, > UINT8 ipProto, > - UINT32 l4Offset, > + OVS_PACKET_HDR_INFO *layers, > OvsConntrackKeyLookupCtx *ctx, > OvsFlowKey *key, > PNAT_ACTION_INFO natInfo, > @@ -293,16 +271,18 @@ OvsCtEntryCreate(OvsForwardingContext *fwdCtx, > switch (ipProto) { > case IPPROTO_TCP: > { > + UINT32 tcpPayloadLen; > TCPHdr tcpStorage; > const TCPHdr *tcp; > - tcp = OvsGetTcp(curNbl, l4Offset, &tcpStorage); > + tcp = OvsGetTcpHeader(curNbl, layers, &tcpStorage, &tcpPayloadLen); > if (!OvsConntrackValidateTcpPacket(tcp)) { > state = OVS_CS_F_INVALID; > break; > } > > if (commit) { > - entry = OvsConntrackCreateTcpEntry(tcp, curNbl, currentTime); > + entry = OvsConntrackCreateTcpEntry(tcp, currentTime, > + tcpPayloadLen); > } > break; > } > @@ -310,7 +290,7 @@ OvsCtEntryCreate(OvsForwardingContext *fwdCtx, > { > ICMPHdr storage; > const ICMPHdr *icmp; > - icmp = OvsGetIcmp(curNbl, l4Offset, &storage); > + icmp = OvsGetIcmp(curNbl, layers->l4Offset, &storage); > if (!OvsConntrackValidateIcmpPacket(icmp)) { > if(icmp) { > OVS_LOG_TRACE("Invalid ICMP packet detected, icmp->type %u", > @@ -371,7 +351,7 @@ static enum CT_UPDATE_RES > OvsCtUpdateEntry(OVS_CT_ENTRY* entry, > PNET_BUFFER_LIST nbl, > UINT8 ipProto, > - UINT32 l4Offset, > + OVS_PACKET_HDR_INFO *layers, > BOOLEAN reply, > UINT64 now) > { > @@ -380,15 +360,17 @@ OvsCtUpdateEntry(OVS_CT_ENTRY* entry, > switch (ipProto) { > case IPPROTO_TCP: > { > + UINT32 tcpPayloadLen; > TCPHdr tcpStorage; > const TCPHdr *tcp; > - tcp = OvsGetTcp(nbl, l4Offset, &tcpStorage); > + tcp = OvsGetTcpHeader(nbl, layers, &tcpStorage, &tcpPayloadLen); > if (!tcp) { > status = CT_UPDATE_INVALID; > break; > } > OVS_ACQUIRE_SPIN_LOCK(&(entry->lock), irql); > - status = OvsConntrackUpdateTcpEntry(entry, tcp, nbl, reply, now); > + status = OvsConntrackUpdateTcpEntry(entry, tcp, reply, now, > + tcpPayloadLen); > OVS_RELEASE_SPIN_LOCK(&(entry->lock), irql); > break; > } > @@ -433,10 +415,7 @@ OvsCtEntryDelete(POVS_CT_ENTRY entry, BOOLEAN forceDelete) > OVS_ACQUIRE_SPIN_LOCK(&(entry->lock), irql); > if (forceDelete || OvsCtEntryExpired(entry)) { > if (entry->natInfo.natAction) { > - LOCK_STATE_EX lockStateNat; > - NdisAcquireRWLockWrite(ovsCtNatLockObj, &lockStateNat, 0); > OvsNatDeleteKey(&entry->key); > - NdisReleaseRWLock(ovsCtNatLockObj, &lockStateNat); > } > OvsPostCtEventEntry(entry, OVS_EVENT_CT_DELETE); > RemoveEntryList(&entry->link); > @@ -479,15 +458,12 @@ OvsDetectCtPacket(OvsForwardingContext *fwdCtx, > } > > BOOLEAN > -OvsCtKeyAreSame(OVS_CT_KEY ctxKey, OVS_CT_KEY entryKey) > +OvsCtEndpointsAreSame(OVS_CT_KEY ctxKey, OVS_CT_KEY entryKey) > { > return ((NdisEqualMemory(&ctxKey.src, &entryKey.src, > sizeof(struct ct_endpoint))) && > (NdisEqualMemory(&ctxKey.dst, &entryKey.dst, > - sizeof(struct ct_endpoint))) && > - (ctxKey.dl_type == entryKey.dl_type) && > - (ctxKey.nw_proto == entryKey.nw_proto) && > - (ctxKey.zone == entryKey.zone)); > + sizeof(struct ct_endpoint)))); > } > > POVS_CT_ENTRY > @@ -499,6 +475,11 @@ OvsCtLookup(OvsConntrackKeyLookupCtx *ctx) > POVS_CT_ENTRY found = NULL; > LOCK_STATE_EX lockStateTable; > UINT32 bucketIdx; > + > + if (!ctTotalEntries) { > + return found; > + } > + The counter above needs protection to guard against change? > /* Reverse NAT must be performed before OvsCtLookup, so here > * we simply need to flip the src and dst in key and compare > * they are equal. Note that flipped key is not equal to > @@ -507,10 +488,6 @@ OvsCtLookup(OvsConntrackKeyLookupCtx *ctx) > OVS_CT_KEY revCtxKey = ctx->key; > OvsCtKeyReverse(&revCtxKey); > > - if (!ctTotalEntries) { > - return found; > - } > - > KIRQL irql = KeGetCurrentIrql(); > bucketIdx = ctx->hash & CT_HASH_TABLE_MASK; > NdisAcquireRWLockRead(ovsCtBucketLock[bucketIdx], &lockStateTable, 0); > @@ -518,12 +495,19 @@ OvsCtLookup(OvsConntrackKeyLookupCtx *ctx) > entry = CONTAINING_RECORD(link, OVS_CT_ENTRY, link); > OVS_ACQUIRE_SPIN_LOCK(&(entry->lock), irql); > > - if (OvsCtKeyAreSame(ctx->key, entry->key)) { > + if ((ctx->key.dl_type != entry->key.dl_type) || > + (ctx->key.nw_proto != entry->key.nw_proto) || > + (ctx->key.zone != entry->key.zone)) { > + OVS_RELEASE_SPIN_LOCK(&(entry->lock), irql); > + continue; > + } > + > + if (OvsCtEndpointsAreSame(ctx->key, entry->key)) { > found = entry; > reply = FALSE; > } > > - if (!found && OvsCtKeyAreSame(revCtxKey, entry->key)) { > + if (!found && OvsCtEndpointsAreSame(revCtxKey, entry->key)) { > found = entry; > reply = TRUE; > } > @@ -649,10 +633,7 @@ OvsCtSetupLookupCtx(OvsFlowKey *flowKey, > return NDIS_STATUS_INVALID_PACKET; > } > > - LOCK_STATE_EX lockStateNat; > - NdisAcquireRWLockRead(ovsCtNatLockObj, &lockStateNat, 0); > natEntry = OvsNatLookup(&ctx->key, TRUE); > - NdisReleaseRWLock(ovsCtNatLockObj, &lockStateNat); > if (natEntry) { > /* Translate address first for reverse NAT */ > ctx->key = natEntry->ctEntry->key; > @@ -678,7 +659,7 @@ OvsDetectFtpPacket(OvsFlowKey *key) { > */ > static __inline POVS_CT_ENTRY > OvsProcessConntrackEntry(OvsForwardingContext *fwdCtx, > - UINT32 l4Offset, > + OVS_PACKET_HDR_INFO *layers, > OvsConntrackKeyLookupCtx *ctx, > OvsFlowKey *key, > UINT16 zone, > @@ -691,7 +672,7 @@ OvsProcessConntrackEntry(OvsForwardingContext *fwdCtx, > UINT32 state = 0; > PNET_BUFFER_LIST curNbl = fwdCtx->curNbl; > LOCK_STATE_EX lockStateTable; > - PNDIS_RW_LOCK_EX bucketLockRef = NULL; > + > *entryCreated = FALSE; > > /* If an entry was found, update the state based on TCP flags */ > @@ -702,8 +683,9 @@ OvsProcessConntrackEntry(OvsForwardingContext *fwdCtx, > } > } else { > CT_UPDATE_RES result; > - result = OvsCtUpdateEntry(entry, curNbl, key->ipKey.nwProto, > - l4Offset, ctx->reply, currentTime); > + UINT32 bucketIdx; > + result = OvsCtUpdateEntry(entry, curNbl, key->ipKey.nwProto, layers, > + ctx->reply, currentTime); > switch (result) { > case CT_UPDATE_VALID: > state |= OVS_CS_F_ESTABLISHED; > @@ -716,12 +698,12 @@ OvsProcessConntrackEntry(OvsForwardingContext *fwdCtx, > break; > case CT_UPDATE_NEW: > //Delete and update the Conntrack > - bucketLockRef = entry->bucketLockRef; > - NdisAcquireRWLockWrite(bucketLockRef, &lockStateTable, 0); > + bucketIdx = ctx->hash & CT_HASH_TABLE_MASK; > + NdisAcquireRWLockWrite(ovsCtBucketLock[bucketIdx], &lockStateTable, 0); > OvsCtEntryDelete(ctx->entry, TRUE); > - NdisReleaseRWLock(bucketLockRef, &lockStateTable); > + NdisReleaseRWLock(ovsCtBucketLock[bucketIdx], &lockStateTable); > ctx->entry = NULL; > - entry = OvsCtEntryCreate(fwdCtx, key->ipKey.nwProto, l4Offset, > + entry = OvsCtEntryCreate(fwdCtx, key->ipKey.nwProto, layers, > ctx, key, natInfo, commit, currentTime, > entryCreated); > if (!entry) { > @@ -869,10 +851,10 @@ OvsCtExecute_(OvsForwardingContext *fwdCtx, > > /* Delete entry in reverse direction if 'force' is specified */ > if (force && ctx.reply && entry) { > - PNDIS_RW_LOCK_EX bucketLockRef = entry->bucketLockRef; > - NdisAcquireRWLockWrite(bucketLockRef, &lockStateTable, 0); > + UINT32 bucketIdx = ctx.hash & CT_HASH_TABLE_MASK; > + NdisAcquireRWLockWrite(ovsCtBucketLock[bucketIdx], &lockStateTable, 0); > OvsCtEntryDelete(entry, TRUE); > - NdisReleaseRWLock(bucketLockRef, &lockStateTable); > + NdisReleaseRWLock(ovsCtBucketLock[bucketIdx], &lockStateTable); > entry = NULL; > } > > @@ -885,7 +867,7 @@ OvsCtExecute_(OvsForwardingContext *fwdCtx, > OvsCtIncrementCounters(entry, ctx.reply, curNbl); > } > /* Process the entry and update CT flags */ > - entry = OvsProcessConntrackEntry(fwdCtx, layers->l4Offset, &ctx, key, > + entry = OvsProcessConntrackEntry(fwdCtx, layers, &ctx, key, > zone, natInfo, commit, currentTime, > &entryCreated); > > @@ -900,7 +882,7 @@ OvsCtExecute_(OvsForwardingContext *fwdCtx, > } > /* If no matching entry was found, create one and add New state */ > entry = OvsCtEntryCreate(fwdCtx, key->ipKey.nwProto, > - layers->l4Offset, &ctx, > + layers, &ctx, > key, natInfo, commit, currentTime, > &entryCreated); > } > @@ -918,13 +900,9 @@ OvsCtExecute_(OvsForwardingContext *fwdCtx, > */ > KIRQL irql = KeGetCurrentIrql(); > OVS_ACQUIRE_SPIN_LOCK(&(entry->lock), irql); > - if (natInfo->natAction != NAT_ACTION_NONE) > - { > - LOCK_STATE_EX lockStateNat; > - NdisAcquireRWLockWrite(ovsCtNatLockObj, &lockStateNat, 0); > + if (natInfo->natAction != NAT_ACTION_NONE) { > OvsNatPacket(fwdCtx, entry, entry->natInfo.natAction, > key, ctx.reply); > - NdisReleaseRWLock(ovsCtNatLockObj, &lockStateNat); > } > > OvsCtSetMarkLabel(key, entry, mark, labels, &triggerUpdateEvent); > @@ -1156,7 +1134,7 @@ OvsCtFlush(UINT16 zone, struct ovs_key_ct_tuple_ipv4 *tuple) > { > PLIST_ENTRY link, next; > POVS_CT_ENTRY entry; > - LOCK_STATE_EX lockState, lockStateNat; > + LOCK_STATE_EX lockState; > > if (ctTotalEntries) { > for (UINT32 i = 0; i < CT_HASH_TABLE_SIZE; i++) { > @@ -1188,9 +1166,7 @@ OvsCtFlush(UINT16 zone, struct ovs_key_ct_tuple_ipv4 *tuple) > } > } > > - NdisAcquireRWLockWrite(ovsCtNatLockObj, &lockStateNat, 0); > OvsNatFlush(zone); > - NdisReleaseRWLock(ovsCtNatLockObj, &lockStateNat); > return NDIS_STATUS_SUCCESS; > } > > diff --git a/datapath-windows/ovsext/Conntrack.h b/datapath-windows/ovsext/Conntrack.h > index 7dc92a1..7a80eea 100644 > --- a/datapath-windows/ovsext/Conntrack.h > +++ b/datapath-windows/ovsext/Conntrack.h > @@ -99,8 +99,6 @@ typedef struct _NAT_ACTION_INFO { > } NAT_ACTION_INFO, *PNAT_ACTION_INFO; > > typedef struct OVS_CT_ENTRY { > - /* Reference to ovsCtBucketLock of ovsConntrackTable.*/ > - PNDIS_RW_LOCK_EX bucketLockRef; > NDIS_SPIN_LOCK lock; /* Protects OVS_CT_ENTRY. */ > OVS_CT_KEY key; > OVS_CT_KEY rev_key; > @@ -156,23 +154,33 @@ OvsConntrackUpdateExpiration(OVS_CT_ENTRY *ctEntry, > ctEntry->expiration = now + interval; > } > > -static __inline UINT32 > -OvsGetTcpPayloadLength(PNET_BUFFER_LIST nbl) > +static const TCPHdr* > +OvsGetTcpHeader(PNET_BUFFER_LIST nbl, > + OVS_PACKET_HDR_INFO *layers, > + VOID *storage, > + UINT32 *tcpPayloadLen) > { > IPHdr *ipHdr; > TCPHdr *tcp; > - char *ipBuf[sizeof(EthHdr) + sizeof(IPHdr) + sizeof(TCPHdr)]; > + VOID *dest = storage; > > - ipHdr = NdisGetDataBuffer(NET_BUFFER_LIST_FIRST_NB(nbl), sizeof *ipBuf, > - (PVOID)&ipBuf, 1 /*no align*/, 0); > + ipHdr = NdisGetDataBuffer(NET_BUFFER_LIST_FIRST_NB(nbl), > + layers->l4Offset + sizeof(TCPHdr), > + NULL, 1 /*no align*/, 0); > if (ipHdr == NULL) { > - return 0; > + return NULL; > } > > - ipHdr = (IPHdr *)((PCHAR)ipHdr + sizeof(EthHdr)); > + ipHdr = (IPHdr *)((PCHAR)ipHdr + layers->l3Offset); > tcp = (TCPHdr *)((PCHAR)ipHdr + ipHdr->ihl * 4); > + if (tcp->doff * 4 >= sizeof *tcp) { > + NdisMoveMemory(dest, tcp, sizeof(TCPHdr)); > + *tcpPayloadLen = ntohs((ipHdr->tot_len) - (ipHdr->ihl * 4) - > + (TCP_HDR_LEN(tcp))); > + return storage; > + } > > - return (ntohs(ipHdr->tot_len) - (ipHdr->ihl * 4) - (TCP_HDR_LEN(tcp))); > + return NULL; > } > > VOID OvsCleanupConntrack(VOID); > @@ -184,17 +192,17 @@ NDIS_STATUS OvsExecuteConntrackAction(OvsForwardingContext *fwdCtx, > BOOLEAN OvsConntrackValidateTcpPacket(const TCPHdr *tcp); > BOOLEAN OvsConntrackValidateIcmpPacket(const ICMPHdr *icmp); > OVS_CT_ENTRY * OvsConntrackCreateTcpEntry(const TCPHdr *tcp, > - PNET_BUFFER_LIST nbl, > - UINT64 now); > + UINT64 now, > + UINT32 tcpPayloadLen); > NDIS_STATUS OvsCtMapTcpProtoInfoToNl(PNL_BUFFER nlBuf, > OVS_CT_ENTRY *conn_); > OVS_CT_ENTRY * OvsConntrackCreateOtherEntry(UINT64 now); > OVS_CT_ENTRY * OvsConntrackCreateIcmpEntry(UINT64 now); > enum CT_UPDATE_RES OvsConntrackUpdateTcpEntry(OVS_CT_ENTRY* conn_, > const TCPHdr *tcp, > - PNET_BUFFER_LIST nbl, > BOOLEAN reply, > - UINT64 now); > + UINT64 now, > + UINT32 tcpPayloadLen); > enum CT_UPDATE_RES OvsConntrackUpdateOtherEntry(OVS_CT_ENTRY *conn_, > BOOLEAN reply, > UINT64 now);
Hi Shashank, Thanks for the review. Please find my response inline. Thanks, Anand Kumar On 6/18/18, 11:54 AM, "Shashank Ram" <rams@vmware.com> wrote: On 06/17/2018 10:37 PM, Anand Kumar wrote: > This patch primarily gets rid of NdisRWLock in conntrack for NAT > functionality along with some conntrack optimization. The subsequent > patch will have a lock implementation inside NAT module. > > - Introduce a new function OvsGetTcpHeader() to retrieve TCP header > and payload length, to optimize for TCP traffic. > - Optimize conntrack look up. > - Remove 'bucketlockRef' member from conntrack entry structure. > > Signed-off-by: Anand Kumar <kumaranand@vmware.com> > --- > datapath-windows/ovsext/Conntrack-ftp.c | 4 +- > datapath-windows/ovsext/Conntrack-tcp.c | 15 ++--- > datapath-windows/ovsext/Conntrack.c | 110 +++++++++++++------------------- > datapath-windows/ovsext/Conntrack.h | 36 +++++++---- > 4 files changed, 74 insertions(+), 91 deletions(-) > > diff --git a/datapath-windows/ovsext/Conntrack-ftp.c b/datapath-windows/ovsext/Conntrack-ftp.c > index 6830dfa..ce09a65 100644 > --- a/datapath-windows/ovsext/Conntrack-ftp.c > +++ b/datapath-windows/ovsext/Conntrack-ftp.c > @@ -129,14 +129,14 @@ OvsCtHandleFtp(PNET_BUFFER_LIST curNbl, > char temp[256] = { 0 }; > char ftpMsg[256] = { 0 }; > > + UINT32 len; > TCPHdr tcpStorage; > const TCPHdr *tcp; > - tcp = OvsGetTcp(curNbl, layers->l4Offset, &tcpStorage); > + tcp = OvsGetTcpHeader(curNbl, layers, &tcpStorage, &len); > if (!tcp) { > return NDIS_STATUS_INVALID_PACKET; > } > > - UINT32 len = OvsGetTcpPayloadLength(curNbl); > if (len > sizeof(temp)) { > /* We only care up to 256 */ > len = sizeof(temp); > diff --git a/datapath-windows/ovsext/Conntrack-tcp.c b/datapath-windows/ovsext/Conntrack-tcp.c > index 8cbab24..eda42ac 100644 > --- a/datapath-windows/ovsext/Conntrack-tcp.c > +++ b/datapath-windows/ovsext/Conntrack-tcp.c > @@ -194,9 +194,9 @@ OvsCastConntrackEntryToTcpEntry(OVS_CT_ENTRY* conn) > enum CT_UPDATE_RES > OvsConntrackUpdateTcpEntry(OVS_CT_ENTRY* conn_, > const TCPHdr *tcp, > - PNET_BUFFER_LIST nbl, > BOOLEAN reply, > - UINT64 now) > + UINT64 now, > + UINT32 tcpPayloadLen) > { > struct conn_tcp *conn = OvsCastConntrackEntryToTcpEntry(conn_); > /* The peer that sent 'pkt' */ > @@ -207,7 +207,6 @@ OvsConntrackUpdateTcpEntry(OVS_CT_ENTRY* conn_, > UINT16 tcp_flags = ntohs(tcp->flags); > uint16_t win = ntohs(tcp->window); > uint32_t ack, end, seq, orig_seq; > - uint32_t p_len = OvsGetTcpPayloadLength(nbl); > int ackskew; > > if (OvsCtInvalidTcpFlags(tcp_flags)) { > @@ -248,7 +247,7 @@ OvsConntrackUpdateTcpEntry(OVS_CT_ENTRY* conn_, > > ack = ntohl(tcp->ack_seq); > > - end = seq + p_len; > + end = seq + tcpPayloadLen; > if (tcp_flags & TCP_SYN) { > end++; > if (dst->wscale & CT_WSCALE_FLAG) { > @@ -287,7 +286,7 @@ OvsConntrackUpdateTcpEntry(OVS_CT_ENTRY* conn_, > > } else { > ack = ntohl(tcp->ack_seq); > - end = seq + p_len; > + end = seq + tcpPayloadLen; > if (tcp_flags & TCP_SYN) { > end++; > } > @@ -469,8 +468,8 @@ OvsConntrackValidateTcpPacket(const TCPHdr *tcp) > > OVS_CT_ENTRY * > OvsConntrackCreateTcpEntry(const TCPHdr *tcp, > - PNET_BUFFER_LIST nbl, > - UINT64 now) > + UINT64 now, > + UINT32 tcpPayloadLen) > { > struct conn_tcp* newconn; > struct tcp_peer *src, *dst; > @@ -486,7 +485,7 @@ OvsConntrackCreateTcpEntry(const TCPHdr *tcp, > dst = &newconn->peer[1]; > > src->seqlo = ntohl(tcp->seq); > - src->seqhi = src->seqlo + OvsGetTcpPayloadLength(nbl) + 1; > + src->seqhi = src->seqlo + tcpPayloadLen + 1; > > if (tcp->flags & TCP_SYN) { > src->seqhi++; > diff --git a/datapath-windows/ovsext/Conntrack.c b/datapath-windows/ovsext/Conntrack.c > index 7b54fba..2a85e57 100644 > --- a/datapath-windows/ovsext/Conntrack.c > +++ b/datapath-windows/ovsext/Conntrack.c > @@ -32,7 +32,6 @@ KSTART_ROUTINE OvsConntrackEntryCleaner; > static PLIST_ENTRY ovsConntrackTable; > static OVS_CT_THREAD_CTX ctThreadCtx; > static PNDIS_RW_LOCK_EX *ovsCtBucketLock = NULL; > -static PNDIS_RW_LOCK_EX ovsCtNatLockObj; > extern POVS_SWITCH_CONTEXT gOvsSwitchContext; > static ULONG ctTotalEntries; > > @@ -54,19 +53,11 @@ OvsInitConntrack(POVS_SWITCH_CONTEXT context) > ctTotalEntries = 0; > UINT32 numBucketLocks = CT_HASH_TABLE_SIZE; > > - /* Init the sync-lock */ > - ovsCtNatLockObj = NdisAllocateRWLock(context->NdisFilterHandle); > - if (ovsCtNatLockObj == NULL) { > - return STATUS_INSUFFICIENT_RESOURCES; > - } > - > /* Init the Hash Buffer */ > ovsConntrackTable = OvsAllocateMemoryWithTag(sizeof(LIST_ENTRY) > * CT_HASH_TABLE_SIZE, > OVS_CT_POOL_TAG); > if (ovsConntrackTable == NULL) { > - NdisFreeRWLock(ovsCtNatLockObj); > - ovsCtNatLockObj = NULL; > return STATUS_INSUFFICIENT_RESOURCES; > } > > @@ -121,8 +112,6 @@ freeBucketLock: > freeTable: > OvsFreeMemoryWithTag(ovsConntrackTable, OVS_CT_POOL_TAG); > ovsConntrackTable = NULL; > - NdisFreeRWLock(ovsCtNatLockObj); > - ovsCtNatLockObj = NULL; > return status; > } > > @@ -135,7 +124,6 @@ freeTable: > VOID > OvsCleanupConntrack(VOID) > { > - LOCK_STATE_EX lockStateNat; > ctThreadCtx.exit = 1; > KeSetEvent(&ctThreadCtx.event, 0, FALSE); > KeWaitForSingleObject(ctThreadCtx.threadObject, Executive, > @@ -160,12 +148,7 @@ OvsCleanupConntrack(VOID) > } > OvsFreeMemoryWithTag(ovsCtBucketLock, OVS_CT_POOL_TAG); > ovsCtBucketLock = NULL; > - > - NdisAcquireRWLockWrite(ovsCtNatLockObj, &lockStateNat, 0); > OvsNatCleanup(); > - NdisReleaseRWLock(ovsCtNatLockObj, &lockStateNat); > - NdisFreeRWLock(ovsCtNatLockObj); > - ovsCtNatLockObj = NULL; > } > > static __inline VOID > @@ -244,25 +227,20 @@ OvsCtAddEntry(POVS_CT_ENTRY entry, > if (natInfo == NULL) { > entry->natInfo.natAction = NAT_ACTION_NONE; > } else { > - LOCK_STATE_EX lockStateNat; > - NdisAcquireRWLockWrite(ovsCtNatLockObj, &lockStateNat, 0); > if (OvsIsForwardNat(natInfo->natAction)) { > entry->natInfo = *natInfo; > if (!OvsNatTranslateCtEntry(entry)) { > - NdisReleaseRWLock(ovsCtNatLockObj, &lockStateNat); > return FALSE; > } > ctx->hash = OvsHashCtKey(&entry->key); > } else { > entry->natInfo.natAction = natInfo->natAction; > } > - NdisReleaseRWLock(ovsCtNatLockObj, &lockStateNat); > } > > entry->timestampStart = now; > NdisAllocateSpinLock(&(entry->lock)); > UINT32 bucketIdx = ctx->hash & CT_HASH_TABLE_MASK; > - entry->bucketLockRef = ovsCtBucketLock[bucketIdx]; > NdisAcquireRWLockWrite(ovsCtBucketLock[bucketIdx], &lockState, 0); > InsertHeadList(&ovsConntrackTable[bucketIdx], > &entry->link); > @@ -275,7 +253,7 @@ OvsCtAddEntry(POVS_CT_ENTRY entry, > static __inline POVS_CT_ENTRY > OvsCtEntryCreate(OvsForwardingContext *fwdCtx, > UINT8 ipProto, > - UINT32 l4Offset, > + OVS_PACKET_HDR_INFO *layers, > OvsConntrackKeyLookupCtx *ctx, > OvsFlowKey *key, > PNAT_ACTION_INFO natInfo, > @@ -293,16 +271,18 @@ OvsCtEntryCreate(OvsForwardingContext *fwdCtx, > switch (ipProto) { > case IPPROTO_TCP: > { > + UINT32 tcpPayloadLen; > TCPHdr tcpStorage; > const TCPHdr *tcp; > - tcp = OvsGetTcp(curNbl, l4Offset, &tcpStorage); > + tcp = OvsGetTcpHeader(curNbl, layers, &tcpStorage, &tcpPayloadLen); > if (!OvsConntrackValidateTcpPacket(tcp)) { > state = OVS_CS_F_INVALID; > break; > } > > if (commit) { > - entry = OvsConntrackCreateTcpEntry(tcp, curNbl, currentTime); > + entry = OvsConntrackCreateTcpEntry(tcp, currentTime, > + tcpPayloadLen); > } > break; > } > @@ -310,7 +290,7 @@ OvsCtEntryCreate(OvsForwardingContext *fwdCtx, > { > ICMPHdr storage; > const ICMPHdr *icmp; > - icmp = OvsGetIcmp(curNbl, l4Offset, &storage); > + icmp = OvsGetIcmp(curNbl, layers->l4Offset, &storage); > if (!OvsConntrackValidateIcmpPacket(icmp)) { > if(icmp) { > OVS_LOG_TRACE("Invalid ICMP packet detected, icmp->type %u", > @@ -371,7 +351,7 @@ static enum CT_UPDATE_RES > OvsCtUpdateEntry(OVS_CT_ENTRY* entry, > PNET_BUFFER_LIST nbl, > UINT8 ipProto, > - UINT32 l4Offset, > + OVS_PACKET_HDR_INFO *layers, > BOOLEAN reply, > UINT64 now) > { > @@ -380,15 +360,17 @@ OvsCtUpdateEntry(OVS_CT_ENTRY* entry, > switch (ipProto) { > case IPPROTO_TCP: > { > + UINT32 tcpPayloadLen; > TCPHdr tcpStorage; > const TCPHdr *tcp; > - tcp = OvsGetTcp(nbl, l4Offset, &tcpStorage); > + tcp = OvsGetTcpHeader(nbl, layers, &tcpStorage, &tcpPayloadLen); > if (!tcp) { > status = CT_UPDATE_INVALID; > break; > } > OVS_ACQUIRE_SPIN_LOCK(&(entry->lock), irql); > - status = OvsConntrackUpdateTcpEntry(entry, tcp, nbl, reply, now); > + status = OvsConntrackUpdateTcpEntry(entry, tcp, reply, now, > + tcpPayloadLen); > OVS_RELEASE_SPIN_LOCK(&(entry->lock), irql); > break; > } > @@ -433,10 +415,7 @@ OvsCtEntryDelete(POVS_CT_ENTRY entry, BOOLEAN forceDelete) > OVS_ACQUIRE_SPIN_LOCK(&(entry->lock), irql); > if (forceDelete || OvsCtEntryExpired(entry)) { > if (entry->natInfo.natAction) { > - LOCK_STATE_EX lockStateNat; > - NdisAcquireRWLockWrite(ovsCtNatLockObj, &lockStateNat, 0); > OvsNatDeleteKey(&entry->key); > - NdisReleaseRWLock(ovsCtNatLockObj, &lockStateNat); > } > OvsPostCtEventEntry(entry, OVS_EVENT_CT_DELETE); > RemoveEntryList(&entry->link); > @@ -479,15 +458,12 @@ OvsDetectCtPacket(OvsForwardingContext *fwdCtx, > } > > BOOLEAN > -OvsCtKeyAreSame(OVS_CT_KEY ctxKey, OVS_CT_KEY entryKey) > +OvsCtEndpointsAreSame(OVS_CT_KEY ctxKey, OVS_CT_KEY entryKey) > { > return ((NdisEqualMemory(&ctxKey.src, &entryKey.src, > sizeof(struct ct_endpoint))) && > (NdisEqualMemory(&ctxKey.dst, &entryKey.dst, > - sizeof(struct ct_endpoint))) && > - (ctxKey.dl_type == entryKey.dl_type) && > - (ctxKey.nw_proto == entryKey.nw_proto) && > - (ctxKey.zone == entryKey.zone)); > + sizeof(struct ct_endpoint)))); > } > > POVS_CT_ENTRY > @@ -499,6 +475,11 @@ OvsCtLookup(OvsConntrackKeyLookupCtx *ctx) > POVS_CT_ENTRY found = NULL; > LOCK_STATE_EX lockStateTable; > UINT32 bucketIdx; > + > + if (!ctTotalEntries) { > + return found; > + } > + The counter above needs protection to guard against change? [AK]: Not needed, since we are not modifying the value. This is needed only to avoid lookup when there are no entries. > /* Reverse NAT must be performed before OvsCtLookup, so here > * we simply need to flip the src and dst in key and compare > * they are equal. Note that flipped key is not equal to > @@ -507,10 +488,6 @@ OvsCtLookup(OvsConntrackKeyLookupCtx *ctx) > OVS_CT_KEY revCtxKey = ctx->key; > OvsCtKeyReverse(&revCtxKey); > > - if (!ctTotalEntries) { > - return found; > - } > - > KIRQL irql = KeGetCurrentIrql(); > bucketIdx = ctx->hash & CT_HASH_TABLE_MASK; > NdisAcquireRWLockRead(ovsCtBucketLock[bucketIdx], &lockStateTable, 0); > @@ -518,12 +495,19 @@ OvsCtLookup(OvsConntrackKeyLookupCtx *ctx) > entry = CONTAINING_RECORD(link, OVS_CT_ENTRY, link); > OVS_ACQUIRE_SPIN_LOCK(&(entry->lock), irql); > > - if (OvsCtKeyAreSame(ctx->key, entry->key)) { > + if ((ctx->key.dl_type != entry->key.dl_type) || > + (ctx->key.nw_proto != entry->key.nw_proto) || > + (ctx->key.zone != entry->key.zone)) { > + OVS_RELEASE_SPIN_LOCK(&(entry->lock), irql); > + continue; > + } > + > + if (OvsCtEndpointsAreSame(ctx->key, entry->key)) { > found = entry; > reply = FALSE; > } > > - if (!found && OvsCtKeyAreSame(revCtxKey, entry->key)) { > + if (!found && OvsCtEndpointsAreSame(revCtxKey, entry->key)) { > found = entry; > reply = TRUE; > } > @@ -649,10 +633,7 @@ OvsCtSetupLookupCtx(OvsFlowKey *flowKey, > return NDIS_STATUS_INVALID_PACKET; > } > > - LOCK_STATE_EX lockStateNat; > - NdisAcquireRWLockRead(ovsCtNatLockObj, &lockStateNat, 0); > natEntry = OvsNatLookup(&ctx->key, TRUE); > - NdisReleaseRWLock(ovsCtNatLockObj, &lockStateNat); > if (natEntry) { > /* Translate address first for reverse NAT */ > ctx->key = natEntry->ctEntry->key; > @@ -678,7 +659,7 @@ OvsDetectFtpPacket(OvsFlowKey *key) { > */ > static __inline POVS_CT_ENTRY > OvsProcessConntrackEntry(OvsForwardingContext *fwdCtx, > - UINT32 l4Offset, > + OVS_PACKET_HDR_INFO *layers, > OvsConntrackKeyLookupCtx *ctx, > OvsFlowKey *key, > UINT16 zone, > @@ -691,7 +672,7 @@ OvsProcessConntrackEntry(OvsForwardingContext *fwdCtx, > UINT32 state = 0; > PNET_BUFFER_LIST curNbl = fwdCtx->curNbl; > LOCK_STATE_EX lockStateTable; > - PNDIS_RW_LOCK_EX bucketLockRef = NULL; > + > *entryCreated = FALSE; > > /* If an entry was found, update the state based on TCP flags */ > @@ -702,8 +683,9 @@ OvsProcessConntrackEntry(OvsForwardingContext *fwdCtx, > } > } else { > CT_UPDATE_RES result; > - result = OvsCtUpdateEntry(entry, curNbl, key->ipKey.nwProto, > - l4Offset, ctx->reply, currentTime); > + UINT32 bucketIdx; > + result = OvsCtUpdateEntry(entry, curNbl, key->ipKey.nwProto, layers, > + ctx->reply, currentTime); > switch (result) { > case CT_UPDATE_VALID: > state |= OVS_CS_F_ESTABLISHED; > @@ -716,12 +698,12 @@ OvsProcessConntrackEntry(OvsForwardingContext *fwdCtx, > break; > case CT_UPDATE_NEW: > //Delete and update the Conntrack > - bucketLockRef = entry->bucketLockRef; > - NdisAcquireRWLockWrite(bucketLockRef, &lockStateTable, 0); > + bucketIdx = ctx->hash & CT_HASH_TABLE_MASK; > + NdisAcquireRWLockWrite(ovsCtBucketLock[bucketIdx], &lockStateTable, 0); > OvsCtEntryDelete(ctx->entry, TRUE); > - NdisReleaseRWLock(bucketLockRef, &lockStateTable); > + NdisReleaseRWLock(ovsCtBucketLock[bucketIdx], &lockStateTable); > ctx->entry = NULL; > - entry = OvsCtEntryCreate(fwdCtx, key->ipKey.nwProto, l4Offset, > + entry = OvsCtEntryCreate(fwdCtx, key->ipKey.nwProto, layers, > ctx, key, natInfo, commit, currentTime, > entryCreated); > if (!entry) { > @@ -869,10 +851,10 @@ OvsCtExecute_(OvsForwardingContext *fwdCtx, > > /* Delete entry in reverse direction if 'force' is specified */ > if (force && ctx.reply && entry) { > - PNDIS_RW_LOCK_EX bucketLockRef = entry->bucketLockRef; > - NdisAcquireRWLockWrite(bucketLockRef, &lockStateTable, 0); > + UINT32 bucketIdx = ctx.hash & CT_HASH_TABLE_MASK; > + NdisAcquireRWLockWrite(ovsCtBucketLock[bucketIdx], &lockStateTable, 0); > OvsCtEntryDelete(entry, TRUE); > - NdisReleaseRWLock(bucketLockRef, &lockStateTable); > + NdisReleaseRWLock(ovsCtBucketLock[bucketIdx], &lockStateTable); > entry = NULL; > } > > @@ -885,7 +867,7 @@ OvsCtExecute_(OvsForwardingContext *fwdCtx, > OvsCtIncrementCounters(entry, ctx.reply, curNbl); > } > /* Process the entry and update CT flags */ > - entry = OvsProcessConntrackEntry(fwdCtx, layers->l4Offset, &ctx, key, > + entry = OvsProcessConntrackEntry(fwdCtx, layers, &ctx, key, > zone, natInfo, commit, currentTime, > &entryCreated); > > @@ -900,7 +882,7 @@ OvsCtExecute_(OvsForwardingContext *fwdCtx, > } > /* If no matching entry was found, create one and add New state */ > entry = OvsCtEntryCreate(fwdCtx, key->ipKey.nwProto, > - layers->l4Offset, &ctx, > + layers, &ctx, > key, natInfo, commit, currentTime, > &entryCreated); > } > @@ -918,13 +900,9 @@ OvsCtExecute_(OvsForwardingContext *fwdCtx, > */ > KIRQL irql = KeGetCurrentIrql(); > OVS_ACQUIRE_SPIN_LOCK(&(entry->lock), irql); > - if (natInfo->natAction != NAT_ACTION_NONE) > - { > - LOCK_STATE_EX lockStateNat; > - NdisAcquireRWLockWrite(ovsCtNatLockObj, &lockStateNat, 0); > + if (natInfo->natAction != NAT_ACTION_NONE) { > OvsNatPacket(fwdCtx, entry, entry->natInfo.natAction, > key, ctx.reply); > - NdisReleaseRWLock(ovsCtNatLockObj, &lockStateNat); > } > > OvsCtSetMarkLabel(key, entry, mark, labels, &triggerUpdateEvent); > @@ -1156,7 +1134,7 @@ OvsCtFlush(UINT16 zone, struct ovs_key_ct_tuple_ipv4 *tuple) > { > PLIST_ENTRY link, next; > POVS_CT_ENTRY entry; > - LOCK_STATE_EX lockState, lockStateNat; > + LOCK_STATE_EX lockState; > > if (ctTotalEntries) { > for (UINT32 i = 0; i < CT_HASH_TABLE_SIZE; i++) { > @@ -1188,9 +1166,7 @@ OvsCtFlush(UINT16 zone, struct ovs_key_ct_tuple_ipv4 *tuple) > } > } > > - NdisAcquireRWLockWrite(ovsCtNatLockObj, &lockStateNat, 0); > OvsNatFlush(zone); > - NdisReleaseRWLock(ovsCtNatLockObj, &lockStateNat); > return NDIS_STATUS_SUCCESS; > } > > diff --git a/datapath-windows/ovsext/Conntrack.h b/datapath-windows/ovsext/Conntrack.h > index 7dc92a1..7a80eea 100644 > --- a/datapath-windows/ovsext/Conntrack.h > +++ b/datapath-windows/ovsext/Conntrack.h > @@ -99,8 +99,6 @@ typedef struct _NAT_ACTION_INFO { > } NAT_ACTION_INFO, *PNAT_ACTION_INFO; > > typedef struct OVS_CT_ENTRY { > - /* Reference to ovsCtBucketLock of ovsConntrackTable.*/ > - PNDIS_RW_LOCK_EX bucketLockRef; > NDIS_SPIN_LOCK lock; /* Protects OVS_CT_ENTRY. */ > OVS_CT_KEY key; > OVS_CT_KEY rev_key; > @@ -156,23 +154,33 @@ OvsConntrackUpdateExpiration(OVS_CT_ENTRY *ctEntry, > ctEntry->expiration = now + interval; > } > > -static __inline UINT32 > -OvsGetTcpPayloadLength(PNET_BUFFER_LIST nbl) > +static const TCPHdr* > +OvsGetTcpHeader(PNET_BUFFER_LIST nbl, > + OVS_PACKET_HDR_INFO *layers, > + VOID *storage, > + UINT32 *tcpPayloadLen) > { > IPHdr *ipHdr; > TCPHdr *tcp; > - char *ipBuf[sizeof(EthHdr) + sizeof(IPHdr) + sizeof(TCPHdr)]; > + VOID *dest = storage; > > - ipHdr = NdisGetDataBuffer(NET_BUFFER_LIST_FIRST_NB(nbl), sizeof *ipBuf, > - (PVOID)&ipBuf, 1 /*no align*/, 0); > + ipHdr = NdisGetDataBuffer(NET_BUFFER_LIST_FIRST_NB(nbl), > + layers->l4Offset + sizeof(TCPHdr), > + NULL, 1 /*no align*/, 0); > if (ipHdr == NULL) { > - return 0; > + return NULL; > } > > - ipHdr = (IPHdr *)((PCHAR)ipHdr + sizeof(EthHdr)); > + ipHdr = (IPHdr *)((PCHAR)ipHdr + layers->l3Offset); > tcp = (TCPHdr *)((PCHAR)ipHdr + ipHdr->ihl * 4); > + if (tcp->doff * 4 >= sizeof *tcp) { > + NdisMoveMemory(dest, tcp, sizeof(TCPHdr)); > + *tcpPayloadLen = ntohs((ipHdr->tot_len) - (ipHdr->ihl * 4) - > + (TCP_HDR_LEN(tcp))); > + return storage; > + } > > - return (ntohs(ipHdr->tot_len) - (ipHdr->ihl * 4) - (TCP_HDR_LEN(tcp))); > + return NULL; > } > > VOID OvsCleanupConntrack(VOID); > @@ -184,17 +192,17 @@ NDIS_STATUS OvsExecuteConntrackAction(OvsForwardingContext *fwdCtx, > BOOLEAN OvsConntrackValidateTcpPacket(const TCPHdr *tcp); > BOOLEAN OvsConntrackValidateIcmpPacket(const ICMPHdr *icmp); > OVS_CT_ENTRY * OvsConntrackCreateTcpEntry(const TCPHdr *tcp, > - PNET_BUFFER_LIST nbl, > - UINT64 now); > + UINT64 now, > + UINT32 tcpPayloadLen); > NDIS_STATUS OvsCtMapTcpProtoInfoToNl(PNL_BUFFER nlBuf, > OVS_CT_ENTRY *conn_); > OVS_CT_ENTRY * OvsConntrackCreateOtherEntry(UINT64 now); > OVS_CT_ENTRY * OvsConntrackCreateIcmpEntry(UINT64 now); > enum CT_UPDATE_RES OvsConntrackUpdateTcpEntry(OVS_CT_ENTRY* conn_, > const TCPHdr *tcp, > - PNET_BUFFER_LIST nbl, > BOOLEAN reply, > - UINT64 now); > + UINT64 now, > + UINT32 tcpPayloadLen); > enum CT_UPDATE_RES OvsConntrackUpdateOtherEntry(OVS_CT_ENTRY *conn_, > BOOLEAN reply, > UINT64 now);
diff --git a/datapath-windows/ovsext/Conntrack-ftp.c b/datapath-windows/ovsext/Conntrack-ftp.c index 6830dfa..ce09a65 100644 --- a/datapath-windows/ovsext/Conntrack-ftp.c +++ b/datapath-windows/ovsext/Conntrack-ftp.c @@ -129,14 +129,14 @@ OvsCtHandleFtp(PNET_BUFFER_LIST curNbl, char temp[256] = { 0 }; char ftpMsg[256] = { 0 }; + UINT32 len; TCPHdr tcpStorage; const TCPHdr *tcp; - tcp = OvsGetTcp(curNbl, layers->l4Offset, &tcpStorage); + tcp = OvsGetTcpHeader(curNbl, layers, &tcpStorage, &len); if (!tcp) { return NDIS_STATUS_INVALID_PACKET; } - UINT32 len = OvsGetTcpPayloadLength(curNbl); if (len > sizeof(temp)) { /* We only care up to 256 */ len = sizeof(temp); diff --git a/datapath-windows/ovsext/Conntrack-tcp.c b/datapath-windows/ovsext/Conntrack-tcp.c index 8cbab24..eda42ac 100644 --- a/datapath-windows/ovsext/Conntrack-tcp.c +++ b/datapath-windows/ovsext/Conntrack-tcp.c @@ -194,9 +194,9 @@ OvsCastConntrackEntryToTcpEntry(OVS_CT_ENTRY* conn) enum CT_UPDATE_RES OvsConntrackUpdateTcpEntry(OVS_CT_ENTRY* conn_, const TCPHdr *tcp, - PNET_BUFFER_LIST nbl, BOOLEAN reply, - UINT64 now) + UINT64 now, + UINT32 tcpPayloadLen) { struct conn_tcp *conn = OvsCastConntrackEntryToTcpEntry(conn_); /* The peer that sent 'pkt' */ @@ -207,7 +207,6 @@ OvsConntrackUpdateTcpEntry(OVS_CT_ENTRY* conn_, UINT16 tcp_flags = ntohs(tcp->flags); uint16_t win = ntohs(tcp->window); uint32_t ack, end, seq, orig_seq; - uint32_t p_len = OvsGetTcpPayloadLength(nbl); int ackskew; if (OvsCtInvalidTcpFlags(tcp_flags)) { @@ -248,7 +247,7 @@ OvsConntrackUpdateTcpEntry(OVS_CT_ENTRY* conn_, ack = ntohl(tcp->ack_seq); - end = seq + p_len; + end = seq + tcpPayloadLen; if (tcp_flags & TCP_SYN) { end++; if (dst->wscale & CT_WSCALE_FLAG) { @@ -287,7 +286,7 @@ OvsConntrackUpdateTcpEntry(OVS_CT_ENTRY* conn_, } else { ack = ntohl(tcp->ack_seq); - end = seq + p_len; + end = seq + tcpPayloadLen; if (tcp_flags & TCP_SYN) { end++; } @@ -469,8 +468,8 @@ OvsConntrackValidateTcpPacket(const TCPHdr *tcp) OVS_CT_ENTRY * OvsConntrackCreateTcpEntry(const TCPHdr *tcp, - PNET_BUFFER_LIST nbl, - UINT64 now) + UINT64 now, + UINT32 tcpPayloadLen) { struct conn_tcp* newconn; struct tcp_peer *src, *dst; @@ -486,7 +485,7 @@ OvsConntrackCreateTcpEntry(const TCPHdr *tcp, dst = &newconn->peer[1]; src->seqlo = ntohl(tcp->seq); - src->seqhi = src->seqlo + OvsGetTcpPayloadLength(nbl) + 1; + src->seqhi = src->seqlo + tcpPayloadLen + 1; if (tcp->flags & TCP_SYN) { src->seqhi++; diff --git a/datapath-windows/ovsext/Conntrack.c b/datapath-windows/ovsext/Conntrack.c index 7b54fba..2a85e57 100644 --- a/datapath-windows/ovsext/Conntrack.c +++ b/datapath-windows/ovsext/Conntrack.c @@ -32,7 +32,6 @@ KSTART_ROUTINE OvsConntrackEntryCleaner; static PLIST_ENTRY ovsConntrackTable; static OVS_CT_THREAD_CTX ctThreadCtx; static PNDIS_RW_LOCK_EX *ovsCtBucketLock = NULL; -static PNDIS_RW_LOCK_EX ovsCtNatLockObj; extern POVS_SWITCH_CONTEXT gOvsSwitchContext; static ULONG ctTotalEntries; @@ -54,19 +53,11 @@ OvsInitConntrack(POVS_SWITCH_CONTEXT context) ctTotalEntries = 0; UINT32 numBucketLocks = CT_HASH_TABLE_SIZE; - /* Init the sync-lock */ - ovsCtNatLockObj = NdisAllocateRWLock(context->NdisFilterHandle); - if (ovsCtNatLockObj == NULL) { - return STATUS_INSUFFICIENT_RESOURCES; - } - /* Init the Hash Buffer */ ovsConntrackTable = OvsAllocateMemoryWithTag(sizeof(LIST_ENTRY) * CT_HASH_TABLE_SIZE, OVS_CT_POOL_TAG); if (ovsConntrackTable == NULL) { - NdisFreeRWLock(ovsCtNatLockObj); - ovsCtNatLockObj = NULL; return STATUS_INSUFFICIENT_RESOURCES; } @@ -121,8 +112,6 @@ freeBucketLock: freeTable: OvsFreeMemoryWithTag(ovsConntrackTable, OVS_CT_POOL_TAG); ovsConntrackTable = NULL; - NdisFreeRWLock(ovsCtNatLockObj); - ovsCtNatLockObj = NULL; return status; } @@ -135,7 +124,6 @@ freeTable: VOID OvsCleanupConntrack(VOID) { - LOCK_STATE_EX lockStateNat; ctThreadCtx.exit = 1; KeSetEvent(&ctThreadCtx.event, 0, FALSE); KeWaitForSingleObject(ctThreadCtx.threadObject, Executive, @@ -160,12 +148,7 @@ OvsCleanupConntrack(VOID) } OvsFreeMemoryWithTag(ovsCtBucketLock, OVS_CT_POOL_TAG); ovsCtBucketLock = NULL; - - NdisAcquireRWLockWrite(ovsCtNatLockObj, &lockStateNat, 0); OvsNatCleanup(); - NdisReleaseRWLock(ovsCtNatLockObj, &lockStateNat); - NdisFreeRWLock(ovsCtNatLockObj); - ovsCtNatLockObj = NULL; } static __inline VOID @@ -244,25 +227,20 @@ OvsCtAddEntry(POVS_CT_ENTRY entry, if (natInfo == NULL) { entry->natInfo.natAction = NAT_ACTION_NONE; } else { - LOCK_STATE_EX lockStateNat; - NdisAcquireRWLockWrite(ovsCtNatLockObj, &lockStateNat, 0); if (OvsIsForwardNat(natInfo->natAction)) { entry->natInfo = *natInfo; if (!OvsNatTranslateCtEntry(entry)) { - NdisReleaseRWLock(ovsCtNatLockObj, &lockStateNat); return FALSE; } ctx->hash = OvsHashCtKey(&entry->key); } else { entry->natInfo.natAction = natInfo->natAction; } - NdisReleaseRWLock(ovsCtNatLockObj, &lockStateNat); } entry->timestampStart = now; NdisAllocateSpinLock(&(entry->lock)); UINT32 bucketIdx = ctx->hash & CT_HASH_TABLE_MASK; - entry->bucketLockRef = ovsCtBucketLock[bucketIdx]; NdisAcquireRWLockWrite(ovsCtBucketLock[bucketIdx], &lockState, 0); InsertHeadList(&ovsConntrackTable[bucketIdx], &entry->link); @@ -275,7 +253,7 @@ OvsCtAddEntry(POVS_CT_ENTRY entry, static __inline POVS_CT_ENTRY OvsCtEntryCreate(OvsForwardingContext *fwdCtx, UINT8 ipProto, - UINT32 l4Offset, + OVS_PACKET_HDR_INFO *layers, OvsConntrackKeyLookupCtx *ctx, OvsFlowKey *key, PNAT_ACTION_INFO natInfo, @@ -293,16 +271,18 @@ OvsCtEntryCreate(OvsForwardingContext *fwdCtx, switch (ipProto) { case IPPROTO_TCP: { + UINT32 tcpPayloadLen; TCPHdr tcpStorage; const TCPHdr *tcp; - tcp = OvsGetTcp(curNbl, l4Offset, &tcpStorage); + tcp = OvsGetTcpHeader(curNbl, layers, &tcpStorage, &tcpPayloadLen); if (!OvsConntrackValidateTcpPacket(tcp)) { state = OVS_CS_F_INVALID; break; } if (commit) { - entry = OvsConntrackCreateTcpEntry(tcp, curNbl, currentTime); + entry = OvsConntrackCreateTcpEntry(tcp, currentTime, + tcpPayloadLen); } break; } @@ -310,7 +290,7 @@ OvsCtEntryCreate(OvsForwardingContext *fwdCtx, { ICMPHdr storage; const ICMPHdr *icmp; - icmp = OvsGetIcmp(curNbl, l4Offset, &storage); + icmp = OvsGetIcmp(curNbl, layers->l4Offset, &storage); if (!OvsConntrackValidateIcmpPacket(icmp)) { if(icmp) { OVS_LOG_TRACE("Invalid ICMP packet detected, icmp->type %u", @@ -371,7 +351,7 @@ static enum CT_UPDATE_RES OvsCtUpdateEntry(OVS_CT_ENTRY* entry, PNET_BUFFER_LIST nbl, UINT8 ipProto, - UINT32 l4Offset, + OVS_PACKET_HDR_INFO *layers, BOOLEAN reply, UINT64 now) { @@ -380,15 +360,17 @@ OvsCtUpdateEntry(OVS_CT_ENTRY* entry, switch (ipProto) { case IPPROTO_TCP: { + UINT32 tcpPayloadLen; TCPHdr tcpStorage; const TCPHdr *tcp; - tcp = OvsGetTcp(nbl, l4Offset, &tcpStorage); + tcp = OvsGetTcpHeader(nbl, layers, &tcpStorage, &tcpPayloadLen); if (!tcp) { status = CT_UPDATE_INVALID; break; } OVS_ACQUIRE_SPIN_LOCK(&(entry->lock), irql); - status = OvsConntrackUpdateTcpEntry(entry, tcp, nbl, reply, now); + status = OvsConntrackUpdateTcpEntry(entry, tcp, reply, now, + tcpPayloadLen); OVS_RELEASE_SPIN_LOCK(&(entry->lock), irql); break; } @@ -433,10 +415,7 @@ OvsCtEntryDelete(POVS_CT_ENTRY entry, BOOLEAN forceDelete) OVS_ACQUIRE_SPIN_LOCK(&(entry->lock), irql); if (forceDelete || OvsCtEntryExpired(entry)) { if (entry->natInfo.natAction) { - LOCK_STATE_EX lockStateNat; - NdisAcquireRWLockWrite(ovsCtNatLockObj, &lockStateNat, 0); OvsNatDeleteKey(&entry->key); - NdisReleaseRWLock(ovsCtNatLockObj, &lockStateNat); } OvsPostCtEventEntry(entry, OVS_EVENT_CT_DELETE); RemoveEntryList(&entry->link); @@ -479,15 +458,12 @@ OvsDetectCtPacket(OvsForwardingContext *fwdCtx, } BOOLEAN -OvsCtKeyAreSame(OVS_CT_KEY ctxKey, OVS_CT_KEY entryKey) +OvsCtEndpointsAreSame(OVS_CT_KEY ctxKey, OVS_CT_KEY entryKey) { return ((NdisEqualMemory(&ctxKey.src, &entryKey.src, sizeof(struct ct_endpoint))) && (NdisEqualMemory(&ctxKey.dst, &entryKey.dst, - sizeof(struct ct_endpoint))) && - (ctxKey.dl_type == entryKey.dl_type) && - (ctxKey.nw_proto == entryKey.nw_proto) && - (ctxKey.zone == entryKey.zone)); + sizeof(struct ct_endpoint)))); } POVS_CT_ENTRY @@ -499,6 +475,11 @@ OvsCtLookup(OvsConntrackKeyLookupCtx *ctx) POVS_CT_ENTRY found = NULL; LOCK_STATE_EX lockStateTable; UINT32 bucketIdx; + + if (!ctTotalEntries) { + return found; + } + /* Reverse NAT must be performed before OvsCtLookup, so here * we simply need to flip the src and dst in key and compare * they are equal. Note that flipped key is not equal to @@ -507,10 +488,6 @@ OvsCtLookup(OvsConntrackKeyLookupCtx *ctx) OVS_CT_KEY revCtxKey = ctx->key; OvsCtKeyReverse(&revCtxKey); - if (!ctTotalEntries) { - return found; - } - KIRQL irql = KeGetCurrentIrql(); bucketIdx = ctx->hash & CT_HASH_TABLE_MASK; NdisAcquireRWLockRead(ovsCtBucketLock[bucketIdx], &lockStateTable, 0); @@ -518,12 +495,19 @@ OvsCtLookup(OvsConntrackKeyLookupCtx *ctx) entry = CONTAINING_RECORD(link, OVS_CT_ENTRY, link); OVS_ACQUIRE_SPIN_LOCK(&(entry->lock), irql); - if (OvsCtKeyAreSame(ctx->key, entry->key)) { + if ((ctx->key.dl_type != entry->key.dl_type) || + (ctx->key.nw_proto != entry->key.nw_proto) || + (ctx->key.zone != entry->key.zone)) { + OVS_RELEASE_SPIN_LOCK(&(entry->lock), irql); + continue; + } + + if (OvsCtEndpointsAreSame(ctx->key, entry->key)) { found = entry; reply = FALSE; } - if (!found && OvsCtKeyAreSame(revCtxKey, entry->key)) { + if (!found && OvsCtEndpointsAreSame(revCtxKey, entry->key)) { found = entry; reply = TRUE; } @@ -649,10 +633,7 @@ OvsCtSetupLookupCtx(OvsFlowKey *flowKey, return NDIS_STATUS_INVALID_PACKET; } - LOCK_STATE_EX lockStateNat; - NdisAcquireRWLockRead(ovsCtNatLockObj, &lockStateNat, 0); natEntry = OvsNatLookup(&ctx->key, TRUE); - NdisReleaseRWLock(ovsCtNatLockObj, &lockStateNat); if (natEntry) { /* Translate address first for reverse NAT */ ctx->key = natEntry->ctEntry->key; @@ -678,7 +659,7 @@ OvsDetectFtpPacket(OvsFlowKey *key) { */ static __inline POVS_CT_ENTRY OvsProcessConntrackEntry(OvsForwardingContext *fwdCtx, - UINT32 l4Offset, + OVS_PACKET_HDR_INFO *layers, OvsConntrackKeyLookupCtx *ctx, OvsFlowKey *key, UINT16 zone, @@ -691,7 +672,7 @@ OvsProcessConntrackEntry(OvsForwardingContext *fwdCtx, UINT32 state = 0; PNET_BUFFER_LIST curNbl = fwdCtx->curNbl; LOCK_STATE_EX lockStateTable; - PNDIS_RW_LOCK_EX bucketLockRef = NULL; + *entryCreated = FALSE; /* If an entry was found, update the state based on TCP flags */ @@ -702,8 +683,9 @@ OvsProcessConntrackEntry(OvsForwardingContext *fwdCtx, } } else { CT_UPDATE_RES result; - result = OvsCtUpdateEntry(entry, curNbl, key->ipKey.nwProto, - l4Offset, ctx->reply, currentTime); + UINT32 bucketIdx; + result = OvsCtUpdateEntry(entry, curNbl, key->ipKey.nwProto, layers, + ctx->reply, currentTime); switch (result) { case CT_UPDATE_VALID: state |= OVS_CS_F_ESTABLISHED; @@ -716,12 +698,12 @@ OvsProcessConntrackEntry(OvsForwardingContext *fwdCtx, break; case CT_UPDATE_NEW: //Delete and update the Conntrack - bucketLockRef = entry->bucketLockRef; - NdisAcquireRWLockWrite(bucketLockRef, &lockStateTable, 0); + bucketIdx = ctx->hash & CT_HASH_TABLE_MASK; + NdisAcquireRWLockWrite(ovsCtBucketLock[bucketIdx], &lockStateTable, 0); OvsCtEntryDelete(ctx->entry, TRUE); - NdisReleaseRWLock(bucketLockRef, &lockStateTable); + NdisReleaseRWLock(ovsCtBucketLock[bucketIdx], &lockStateTable); ctx->entry = NULL; - entry = OvsCtEntryCreate(fwdCtx, key->ipKey.nwProto, l4Offset, + entry = OvsCtEntryCreate(fwdCtx, key->ipKey.nwProto, layers, ctx, key, natInfo, commit, currentTime, entryCreated); if (!entry) { @@ -869,10 +851,10 @@ OvsCtExecute_(OvsForwardingContext *fwdCtx, /* Delete entry in reverse direction if 'force' is specified */ if (force && ctx.reply && entry) { - PNDIS_RW_LOCK_EX bucketLockRef = entry->bucketLockRef; - NdisAcquireRWLockWrite(bucketLockRef, &lockStateTable, 0); + UINT32 bucketIdx = ctx.hash & CT_HASH_TABLE_MASK; + NdisAcquireRWLockWrite(ovsCtBucketLock[bucketIdx], &lockStateTable, 0); OvsCtEntryDelete(entry, TRUE); - NdisReleaseRWLock(bucketLockRef, &lockStateTable); + NdisReleaseRWLock(ovsCtBucketLock[bucketIdx], &lockStateTable); entry = NULL; } @@ -885,7 +867,7 @@ OvsCtExecute_(OvsForwardingContext *fwdCtx, OvsCtIncrementCounters(entry, ctx.reply, curNbl); } /* Process the entry and update CT flags */ - entry = OvsProcessConntrackEntry(fwdCtx, layers->l4Offset, &ctx, key, + entry = OvsProcessConntrackEntry(fwdCtx, layers, &ctx, key, zone, natInfo, commit, currentTime, &entryCreated); @@ -900,7 +882,7 @@ OvsCtExecute_(OvsForwardingContext *fwdCtx, } /* If no matching entry was found, create one and add New state */ entry = OvsCtEntryCreate(fwdCtx, key->ipKey.nwProto, - layers->l4Offset, &ctx, + layers, &ctx, key, natInfo, commit, currentTime, &entryCreated); } @@ -918,13 +900,9 @@ OvsCtExecute_(OvsForwardingContext *fwdCtx, */ KIRQL irql = KeGetCurrentIrql(); OVS_ACQUIRE_SPIN_LOCK(&(entry->lock), irql); - if (natInfo->natAction != NAT_ACTION_NONE) - { - LOCK_STATE_EX lockStateNat; - NdisAcquireRWLockWrite(ovsCtNatLockObj, &lockStateNat, 0); + if (natInfo->natAction != NAT_ACTION_NONE) { OvsNatPacket(fwdCtx, entry, entry->natInfo.natAction, key, ctx.reply); - NdisReleaseRWLock(ovsCtNatLockObj, &lockStateNat); } OvsCtSetMarkLabel(key, entry, mark, labels, &triggerUpdateEvent); @@ -1156,7 +1134,7 @@ OvsCtFlush(UINT16 zone, struct ovs_key_ct_tuple_ipv4 *tuple) { PLIST_ENTRY link, next; POVS_CT_ENTRY entry; - LOCK_STATE_EX lockState, lockStateNat; + LOCK_STATE_EX lockState; if (ctTotalEntries) { for (UINT32 i = 0; i < CT_HASH_TABLE_SIZE; i++) { @@ -1188,9 +1166,7 @@ OvsCtFlush(UINT16 zone, struct ovs_key_ct_tuple_ipv4 *tuple) } } - NdisAcquireRWLockWrite(ovsCtNatLockObj, &lockStateNat, 0); OvsNatFlush(zone); - NdisReleaseRWLock(ovsCtNatLockObj, &lockStateNat); return NDIS_STATUS_SUCCESS; } diff --git a/datapath-windows/ovsext/Conntrack.h b/datapath-windows/ovsext/Conntrack.h index 7dc92a1..7a80eea 100644 --- a/datapath-windows/ovsext/Conntrack.h +++ b/datapath-windows/ovsext/Conntrack.h @@ -99,8 +99,6 @@ typedef struct _NAT_ACTION_INFO { } NAT_ACTION_INFO, *PNAT_ACTION_INFO; typedef struct OVS_CT_ENTRY { - /* Reference to ovsCtBucketLock of ovsConntrackTable.*/ - PNDIS_RW_LOCK_EX bucketLockRef; NDIS_SPIN_LOCK lock; /* Protects OVS_CT_ENTRY. */ OVS_CT_KEY key; OVS_CT_KEY rev_key; @@ -156,23 +154,33 @@ OvsConntrackUpdateExpiration(OVS_CT_ENTRY *ctEntry, ctEntry->expiration = now + interval; } -static __inline UINT32 -OvsGetTcpPayloadLength(PNET_BUFFER_LIST nbl) +static const TCPHdr* +OvsGetTcpHeader(PNET_BUFFER_LIST nbl, + OVS_PACKET_HDR_INFO *layers, + VOID *storage, + UINT32 *tcpPayloadLen) { IPHdr *ipHdr; TCPHdr *tcp; - char *ipBuf[sizeof(EthHdr) + sizeof(IPHdr) + sizeof(TCPHdr)]; + VOID *dest = storage; - ipHdr = NdisGetDataBuffer(NET_BUFFER_LIST_FIRST_NB(nbl), sizeof *ipBuf, - (PVOID)&ipBuf, 1 /*no align*/, 0); + ipHdr = NdisGetDataBuffer(NET_BUFFER_LIST_FIRST_NB(nbl), + layers->l4Offset + sizeof(TCPHdr), + NULL, 1 /*no align*/, 0); if (ipHdr == NULL) { - return 0; + return NULL; } - ipHdr = (IPHdr *)((PCHAR)ipHdr + sizeof(EthHdr)); + ipHdr = (IPHdr *)((PCHAR)ipHdr + layers->l3Offset); tcp = (TCPHdr *)((PCHAR)ipHdr + ipHdr->ihl * 4); + if (tcp->doff * 4 >= sizeof *tcp) { + NdisMoveMemory(dest, tcp, sizeof(TCPHdr)); + *tcpPayloadLen = ntohs((ipHdr->tot_len) - (ipHdr->ihl * 4) - + (TCP_HDR_LEN(tcp))); + return storage; + } - return (ntohs(ipHdr->tot_len) - (ipHdr->ihl * 4) - (TCP_HDR_LEN(tcp))); + return NULL; } VOID OvsCleanupConntrack(VOID); @@ -184,17 +192,17 @@ NDIS_STATUS OvsExecuteConntrackAction(OvsForwardingContext *fwdCtx, BOOLEAN OvsConntrackValidateTcpPacket(const TCPHdr *tcp); BOOLEAN OvsConntrackValidateIcmpPacket(const ICMPHdr *icmp); OVS_CT_ENTRY * OvsConntrackCreateTcpEntry(const TCPHdr *tcp, - PNET_BUFFER_LIST nbl, - UINT64 now); + UINT64 now, + UINT32 tcpPayloadLen); NDIS_STATUS OvsCtMapTcpProtoInfoToNl(PNL_BUFFER nlBuf, OVS_CT_ENTRY *conn_); OVS_CT_ENTRY * OvsConntrackCreateOtherEntry(UINT64 now); OVS_CT_ENTRY * OvsConntrackCreateIcmpEntry(UINT64 now); enum CT_UPDATE_RES OvsConntrackUpdateTcpEntry(OVS_CT_ENTRY* conn_, const TCPHdr *tcp, - PNET_BUFFER_LIST nbl, BOOLEAN reply, - UINT64 now); + UINT64 now, + UINT32 tcpPayloadLen); enum CT_UPDATE_RES OvsConntrackUpdateOtherEntry(OVS_CT_ENTRY *conn_, BOOLEAN reply, UINT64 now);
This patch primarily gets rid of NdisRWLock in conntrack for NAT functionality along with some conntrack optimization. The subsequent patch will have a lock implementation inside NAT module. - Introduce a new function OvsGetTcpHeader() to retrieve TCP header and payload length, to optimize for TCP traffic. - Optimize conntrack look up. - Remove 'bucketlockRef' member from conntrack entry structure. Signed-off-by: Anand Kumar <kumaranand@vmware.com> --- datapath-windows/ovsext/Conntrack-ftp.c | 4 +- datapath-windows/ovsext/Conntrack-tcp.c | 15 ++--- datapath-windows/ovsext/Conntrack.c | 110 +++++++++++++------------------- datapath-windows/ovsext/Conntrack.h | 36 +++++++---- 4 files changed, 74 insertions(+), 91 deletions(-)