Message ID | 20180618053729.2372-2-kumaranand@vmware.com |
---|---|
State | Superseded |
Headers | show |
Series | Optimize conntrack performance | expand |
On 06/17/2018 10:37 PM, Anand Kumar wrote: > This patch mainly changes a ndis RW lock for conntrack entry to a > spinlock along with some minor refactor in conntrack. Using > spinlock instead of RW lock as RW locks causes performance hits > when acquired/released multiple times. > > - Use NdisInterlockedXX wrapper api's instead of InterlockedXX. > - Update 'ctTotalRelatedEntries' using interlocked functions. > - Move conntrack lock out of NAT module. > > Signed-off-by: Anand Kumar <kumaranand@vmware.com> > --- > datapath-windows/ovsext/Conntrack-nat.c | 7 +- > datapath-windows/ovsext/Conntrack-related.c | 17 ++-- > datapath-windows/ovsext/Conntrack.c | 134 ++++++++++++++-------------- > datapath-windows/ovsext/Conntrack.h | 2 +- > datapath-windows/ovsext/Util.h | 18 ++++ > 5 files changed, 93 insertions(+), 85 deletions(-) > > diff --git a/datapath-windows/ovsext/Conntrack-nat.c b/datapath-windows/ovsext/Conntrack-nat.c > index 316c946..da1814f 100644 > --- a/datapath-windows/ovsext/Conntrack-nat.c > +++ b/datapath-windows/ovsext/Conntrack-nat.c > @@ -167,16 +167,13 @@ OvsNatPacket(OvsForwardingContext *ovsFwdCtx, > { > UINT32 natFlag; > const struct ct_endpoint* endpoint; > - LOCK_STATE_EX lockState; > - /* XXX: Move conntrack locks out of NAT after implementing lock in NAT. */ > - NdisAcquireRWLockRead(entry->lock, &lockState, 0); > + > /* When it is NAT, only entry->rev_key contains NATTED address; > When it is unNAT, only entry->key contains the UNNATTED address;*/ > const OVS_CT_KEY *ctKey = reverse ? &entry->key : &entry->rev_key; > BOOLEAN isSrcNat; > > if (!(natAction & (NAT_ACTION_SRC | NAT_ACTION_DST))) { > - NdisReleaseRWLock(entry->lock, &lockState); > return; > } > isSrcNat = (((natAction & NAT_ACTION_SRC) && !reverse) || > @@ -206,7 +203,6 @@ OvsNatPacket(OvsForwardingContext *ovsFwdCtx, > } > } else if (ctKey->dl_type == htons(ETH_TYPE_IPV6)){ > // XXX: IPv6 packet not supported yet. > - NdisReleaseRWLock(entry->lock, &lockState); > return; > } > if (natAction & (NAT_ACTION_SRC_PORT | NAT_ACTION_DST_PORT)) { > @@ -220,7 +216,6 @@ OvsNatPacket(OvsForwardingContext *ovsFwdCtx, > } > } > } > - NdisReleaseRWLock(entry->lock, &lockState); > } > > > diff --git a/datapath-windows/ovsext/Conntrack-related.c b/datapath-windows/ovsext/Conntrack-related.c > index ec4b536..b798137 100644 > --- a/datapath-windows/ovsext/Conntrack-related.c > +++ b/datapath-windows/ovsext/Conntrack-related.c > @@ -18,7 +18,7 @@ > #include "Jhash.h" > > static PLIST_ENTRY ovsCtRelatedTable; /* Holds related entries */ > -static UINT64 ctTotalRelatedEntries; > +static ULONG ctTotalRelatedEntries; > static OVS_CT_THREAD_CTX ctRelThreadCtx; > static PNDIS_RW_LOCK_EX ovsCtRelatedLockObj; > extern POVS_SWITCH_CONTEXT gOvsSwitchContext; > @@ -75,13 +75,11 @@ OvsCtRelatedLookup(OVS_CT_KEY key, UINT64 currentTime) > POVS_CT_REL_ENTRY entry; > LOCK_STATE_EX lockState; > > - NdisAcquireRWLockRead(ovsCtRelatedLockObj, &lockState, 0); > - > if (!ctTotalRelatedEntries) { > - NdisReleaseRWLock(ovsCtRelatedLockObj, &lockState); > return NULL; > } > > + NdisAcquireRWLockRead(ovsCtRelatedLockObj, &lockState, 0); > for (int i = 0; i < CT_HASH_TABLE_SIZE; i++) { > /* XXX - Scan the table based on the hash instead */ > LIST_FORALL_SAFE(&ovsCtRelatedTable[i], link, next) { > @@ -103,7 +101,7 @@ OvsCtRelatedEntryDelete(POVS_CT_REL_ENTRY entry) > { > RemoveEntryList(&entry->link); > OvsFreeMemoryWithTag(entry, OVS_CT_POOL_TAG); > - ctTotalRelatedEntries--; > + NdisInterlockedDecrement((PLONG)&ctTotalRelatedEntries); > } > > NDIS_STATUS > @@ -139,7 +137,7 @@ OvsCtRelatedEntryCreate(UINT8 ipProto, > NdisAcquireRWLockWrite(ovsCtRelatedLockObj, &lockState, 0); > InsertHeadList(&ovsCtRelatedTable[hash & CT_HASH_TABLE_MASK], > &entry->link); > - ctTotalRelatedEntries++; > + NdisInterlockedIncrement((PLONG)&ctTotalRelatedEntries); > NdisReleaseRWLock(ovsCtRelatedLockObj, &lockState); > > return NDIS_STATUS_SUCCESS; > @@ -150,11 +148,10 @@ OvsCtRelatedFlush() > { > PLIST_ENTRY link, next; > POVS_CT_REL_ENTRY entry; > - > LOCK_STATE_EX lockState; > - NdisAcquireRWLockWrite(ovsCtRelatedLockObj, &lockState, 0); > > if (ctTotalRelatedEntries) { > + NdisAcquireRWLockWrite(ovsCtRelatedLockObj, &lockState, 0); > for (int i = 0; i < CT_HASH_TABLE_SIZE; i++) { > LIST_FORALL_SAFE(&ovsCtRelatedTable[i], link, next) { > entry = CONTAINING_RECORD(link, OVS_CT_REL_ENTRY, link); > @@ -189,9 +186,8 @@ OvsCtRelatedEntryCleaner(PVOID data) > /* Lock has been freed by 'OvsCleanupCtRelated()' */ > break; > } > - NdisAcquireRWLockWrite(ovsCtRelatedLockObj, &lockState, 0); > + > if (context->exit) { > - NdisReleaseRWLock(ovsCtRelatedLockObj, &lockState); > break; > } > > @@ -201,6 +197,7 @@ OvsCtRelatedEntryCleaner(PVOID data) > threadSleepTimeout = currentTime + CT_CLEANUP_INTERVAL; > > if (ctTotalRelatedEntries) { > + NdisAcquireRWLockWrite(ovsCtRelatedLockObj, &lockState, 0); > for (int i = 0; i < CT_HASH_TABLE_SIZE; i++) { > LIST_FORALL_SAFE(&ovsCtRelatedTable[i], link, next) { > entry = CONTAINING_RECORD(link, OVS_CT_REL_ENTRY, link); > diff --git a/datapath-windows/ovsext/Conntrack.c b/datapath-windows/ovsext/Conntrack.c > index add1491..7b54fba 100644 > --- a/datapath-windows/ovsext/Conntrack.c > +++ b/datapath-windows/ovsext/Conntrack.c > @@ -34,7 +34,7 @@ static OVS_CT_THREAD_CTX ctThreadCtx; > static PNDIS_RW_LOCK_EX *ovsCtBucketLock = NULL; > static PNDIS_RW_LOCK_EX ovsCtNatLockObj; > extern POVS_SWITCH_CONTEXT gOvsSwitchContext; > -static LONG ctTotalEntries; > +static ULONG ctTotalEntries; > > static __inline OvsCtFlush(UINT16 zone, struct ovs_key_ct_tuple_ipv4 *tuple); > static __inline NDIS_STATUS > @@ -208,7 +208,6 @@ OvsPostCtEventEntry(POVS_CT_ENTRY entry, UINT8 type) > { > OVS_CT_EVENT_ENTRY ctEventEntry = {0}; > NdisMoveMemory(&ctEventEntry.entry, entry, sizeof(OVS_CT_ENTRY)); > - ctEventEntry.entry.lock = NULL; > ctEventEntry.entry.parent = NULL; > ctEventEntry.type = type; > OvsPostCtEvent(&ctEventEntry); > @@ -217,8 +216,8 @@ OvsPostCtEventEntry(POVS_CT_ENTRY entry, UINT8 type) > static __inline VOID > OvsCtIncrementCounters(POVS_CT_ENTRY entry, BOOLEAN reply, PNET_BUFFER_LIST nbl) > { > - LOCK_STATE_EX lockState; > - NdisAcquireRWLockWrite(entry->lock, &lockState, 0); > + KIRQL irql = KeGetCurrentIrql(); > + OVS_ACQUIRE_SPIN_LOCK(&(entry->lock), irql); > if (reply) { > entry->rev_key.byteCount+= OvsPacketLenNBL(nbl); > entry->rev_key.packetCount++; > @@ -226,11 +225,11 @@ OvsCtIncrementCounters(POVS_CT_ENTRY entry, BOOLEAN reply, PNET_BUFFER_LIST nbl) > entry->key.byteCount += OvsPacketLenNBL(nbl); > entry->key.packetCount++; > } > - NdisReleaseRWLock(entry->lock, &lockState); > + OVS_RELEASE_SPIN_LOCK(&(entry->lock), irql); Can we simply use NdisInterlocked functions instead of having a lock to perform increments? Also, in case the lock is only being acquired once within a function, there is no need to calculate the dispatch level, let NDIS do it. Calculating the dispatch level helps only in cases where the locks are being acquired multiple times within a given context. > } > > static __inline BOOLEAN > -OvsCtAddEntry(POVS_SWITCH_CONTEXT switchContext, POVS_CT_ENTRY entry, > +OvsCtAddEntry(POVS_CT_ENTRY entry, > OvsConntrackKeyLookupCtx *ctx, > PNAT_ACTION_INFO natInfo, UINT64 now) > { > @@ -261,18 +260,14 @@ OvsCtAddEntry(POVS_SWITCH_CONTEXT switchContext, POVS_CT_ENTRY entry, > } > > entry->timestampStart = now; > - entry->lock = NdisAllocateRWLock(switchContext->NdisFilterHandle); > - if (entry->lock == NULL) { > - OVS_LOG_ERROR("NdisAllocateRWLock failed : Insufficient resources"); > - return FALSE; > - } > + NdisAllocateSpinLock(&(entry->lock)); > UINT32 bucketIdx = ctx->hash & CT_HASH_TABLE_MASK; > entry->bucketLockRef = ovsCtBucketLock[bucketIdx]; > NdisAcquireRWLockWrite(ovsCtBucketLock[bucketIdx], &lockState, 0); > InsertHeadList(&ovsConntrackTable[bucketIdx], > &entry->link); > > - InterlockedIncrement((LONG volatile *)&ctTotalEntries); > + NdisInterlockedIncrement((PLONG)&ctTotalEntries); > NdisReleaseRWLock(ovsCtBucketLock[bucketIdx], &lockState); > return TRUE; > } > @@ -351,8 +346,7 @@ OvsCtEntryCreate(OvsForwardingContext *fwdCtx, > if (state != OVS_CS_F_INVALID && commit) { > if (entry) { > entry->parent = parentEntry; > - if (OvsCtAddEntry(fwdCtx->switchContext, entry, ctx, > - natInfo, currentTime)) { > + if (OvsCtAddEntry(entry, ctx, natInfo, currentTime)) { > *entryCreated = TRUE; > } else { > /* Unable to add entry to the list */ > @@ -382,8 +376,7 @@ OvsCtUpdateEntry(OVS_CT_ENTRY* entry, > UINT64 now) > { > CT_UPDATE_RES status; > - LOCK_STATE_EX lockState; > - NdisAcquireRWLockWrite(entry->lock, &lockState, 0); > + KIRQL irql = KeGetCurrentIrql(); > switch (ipProto) { > case IPPROTO_TCP: > { > @@ -394,20 +387,25 @@ OvsCtUpdateEntry(OVS_CT_ENTRY* entry, > status = CT_UPDATE_INVALID; > break; > } > + OVS_ACQUIRE_SPIN_LOCK(&(entry->lock), irql); > status = OvsConntrackUpdateTcpEntry(entry, tcp, nbl, reply, now); > + OVS_RELEASE_SPIN_LOCK(&(entry->lock), irql); > break; > } Could you fix the style here for consistency by removing the braces for the case above. > case IPPROTO_ICMP: > + OVS_ACQUIRE_SPIN_LOCK(&(entry->lock), irql); > status = OvsConntrackUpdateIcmpEntry(entry, reply, now); > + OVS_RELEASE_SPIN_LOCK(&(entry->lock), irql); > break; > case IPPROTO_UDP: > + OVS_ACQUIRE_SPIN_LOCK(&(entry->lock), irql); > status = OvsConntrackUpdateOtherEntry(entry, reply, now); > + OVS_RELEASE_SPIN_LOCK(&(entry->lock), irql); > break; > default: > status = CT_UPDATE_INVALID; > break; > } > - NdisReleaseRWLock(entry->lock, &lockState); > return status; > } > > @@ -431,8 +429,8 @@ OvsCtEntryDelete(POVS_CT_ENTRY entry, BOOLEAN forceDelete) > if (entry == NULL) { > return; > } > - LOCK_STATE_EX lockState; > - NdisAcquireRWLockWrite(entry->lock, &lockState, 0); > + KIRQL irql = KeGetCurrentIrql(); > + OVS_ACQUIRE_SPIN_LOCK(&(entry->lock), irql); > if (forceDelete || OvsCtEntryExpired(entry)) { > if (entry->natInfo.natAction) { > LOCK_STATE_EX lockStateNat; > @@ -442,13 +440,13 @@ OvsCtEntryDelete(POVS_CT_ENTRY entry, BOOLEAN forceDelete) > } > OvsPostCtEventEntry(entry, OVS_EVENT_CT_DELETE); > RemoveEntryList(&entry->link); > - NdisReleaseRWLock(entry->lock, &lockState); > - NdisFreeRWLock(entry->lock); > + OVS_RELEASE_SPIN_LOCK(&(entry->lock), irql); > + NdisFreeSpinLock(&(entry->lock)); > OvsFreeMemoryWithTag(entry, OVS_CT_POOL_TAG); > - InterlockedDecrement((LONG volatile*)&ctTotalEntries); > + NdisInterlockedDecrement((PLONG)&ctTotalEntries); > return; > } > - NdisReleaseRWLock(entry->lock, &lockState); > + OVS_RELEASE_SPIN_LOCK(&(entry->lock), irql); Why is the entry lock not being freed here? > } > > static __inline NDIS_STATUS > @@ -499,7 +497,7 @@ OvsCtLookup(OvsConntrackKeyLookupCtx *ctx) > POVS_CT_ENTRY entry; > BOOLEAN reply = FALSE; > POVS_CT_ENTRY found = NULL; > - LOCK_STATE_EX lockState, lockStateTable; > + LOCK_STATE_EX lockStateTable; > UINT32 bucketIdx; > /* Reverse NAT must be performed before OvsCtLookup, so here > * we simply need to flip the src and dst in key and compare > @@ -512,11 +510,14 @@ OvsCtLookup(OvsConntrackKeyLookupCtx *ctx) > if (!ctTotalEntries) { > return found; > } > + > + KIRQL irql = KeGetCurrentIrql(); > bucketIdx = ctx->hash & CT_HASH_TABLE_MASK; > NdisAcquireRWLockRead(ovsCtBucketLock[bucketIdx], &lockStateTable, 0); > LIST_FORALL(&ovsConntrackTable[bucketIdx], link) { > entry = CONTAINING_RECORD(link, OVS_CT_ENTRY, link); > - NdisAcquireRWLockRead(entry->lock, &lockState, 0); > + OVS_ACQUIRE_SPIN_LOCK(&(entry->lock), irql); > + > if (OvsCtKeyAreSame(ctx->key, entry->key)) { > found = entry; > reply = FALSE; > @@ -533,10 +534,10 @@ OvsCtLookup(OvsConntrackKeyLookupCtx *ctx) > } else { > ctx->reply = reply; > } > - NdisReleaseRWLock(entry->lock, &lockState); > + OVS_RELEASE_SPIN_LOCK(&(entry->lock), irql); > break; > } > - NdisReleaseRWLock(entry->lock, &lockState); > + OVS_RELEASE_SPIN_LOCK(&(entry->lock), irql); > } > > NdisReleaseRWLock(ovsCtBucketLock[bucketIdx], &lockStateTable); > @@ -689,7 +690,7 @@ OvsProcessConntrackEntry(OvsForwardingContext *fwdCtx, > POVS_CT_ENTRY entry = ctx->entry; > UINT32 state = 0; > PNET_BUFFER_LIST curNbl = fwdCtx->curNbl; > - LOCK_STATE_EX lockState, lockStateTable; > + LOCK_STATE_EX lockStateTable; > PNDIS_RW_LOCK_EX bucketLockRef = NULL; > *entryCreated = FALSE; > > @@ -730,7 +731,8 @@ OvsProcessConntrackEntry(OvsForwardingContext *fwdCtx, > } > } > if (entry) { > - NdisAcquireRWLockRead(entry->lock, &lockState, 0); > + KIRQL irql = KeGetCurrentIrql(); > + OVS_ACQUIRE_SPIN_LOCK(&(entry->lock), irql); No need to figure out IRQL every time. > if (key->ipKey.nwProto == IPPROTO_TCP) { > /* Update the related bit if there is a parent */ > if (entry->parent) { > @@ -748,7 +750,7 @@ OvsProcessConntrackEntry(OvsForwardingContext *fwdCtx, > /* Copy mark and label from entry into flowKey. If actions specify > different mark and label, update the flowKey. */ > OvsCtUpdateFlowKey(key, state, zone, entry->mark, &entry->labels); > - NdisReleaseRWLock(entry->lock, &lockState); > + OVS_RELEASE_SPIN_LOCK(&(entry->lock), irql); > } else { > OvsCtUpdateFlowKey(key, state, zone, 0, NULL); > } > @@ -802,8 +804,6 @@ OvsCtSetMarkLabel(OvsFlowKey *key, > MD_LABELS *labels, > BOOLEAN *triggerUpdateEvent) > { > - LOCK_STATE_EX lockState; > - NdisAcquireRWLockWrite(entry->lock, &lockState, 0); > if (mark) { > OvsConntrackSetMark(key, entry, mark->value, mark->mask, > triggerUpdateEvent); > @@ -813,7 +813,6 @@ OvsCtSetMarkLabel(OvsFlowKey *key, > OvsConntrackSetLabels(key, entry, &labels->value, &labels->mask, > triggerUpdateEvent); > } > - NdisReleaseRWLock(entry->lock, &lockState); > } > > /* > @@ -854,10 +853,11 @@ OvsCtExecute_(OvsForwardingContext *fwdCtx, > { > NDIS_STATUS status = NDIS_STATUS_SUCCESS; > BOOLEAN triggerUpdateEvent = FALSE; > + BOOLEAN entryCreated = FALSE; > POVS_CT_ENTRY entry = NULL; > PNET_BUFFER_LIST curNbl = fwdCtx->curNbl; > OvsConntrackKeyLookupCtx ctx = { 0 }; > - LOCK_STATE_EX lockState, lockStateTable; > + LOCK_STATE_EX lockStateTable; > UINT64 currentTime; > NdisGetCurrentSystemTime((LARGE_INTEGER *) ¤tTime); > > @@ -866,10 +866,9 @@ OvsCtExecute_(OvsForwardingContext *fwdCtx, > > /* Lookup Conntrack entries for a matching entry */ > entry = OvsCtLookup(&ctx); > - BOOLEAN entryCreated = FALSE; > > /* Delete entry in reverse direction if 'force' is specified */ > - if (entry && force && ctx.reply) { > + if (force && ctx.reply && entry) { > PNDIS_RW_LOCK_EX bucketLockRef = entry->bucketLockRef; > NdisAcquireRWLockWrite(bucketLockRef, &lockStateTable, 0); > OvsCtEntryDelete(entry, TRUE); > @@ -877,34 +876,33 @@ OvsCtExecute_(OvsForwardingContext *fwdCtx, > entry = NULL; > } > > - if (!entry && commit && ctTotalEntries >= CT_MAX_ENTRIES) { > - /* Don't proceed with processing if the max limit has been hit. > - * This blocks only new entries from being created and doesn't > - * affect existing connections. > + if (entry) { > + /* Increment stats for the entry if it wasn't tracked previously or > + * if they are on different zones > */ > - OVS_LOG_ERROR("Conntrack Limit hit: %lu", ctTotalEntries); > - return NDIS_STATUS_RESOURCES; > - } > - > - /* Increment stats for the entry if it wasn't tracked previously or > - * if they are on different zones > - */ > - if (entry && (entry->key.zone != key->ct.zone || > - (!(key->ct.state & OVS_CS_F_TRACKED)))) { > - OvsCtIncrementCounters(entry, ctx.reply, curNbl); > - } > + if ((entry->key.zone != key->ct.zone || > + (!(key->ct.state & OVS_CS_F_TRACKED)))) { > + OvsCtIncrementCounters(entry, ctx.reply, curNbl); > + } > + /* Process the entry and update CT flags */ > + entry = OvsProcessConntrackEntry(fwdCtx, layers->l4Offset, &ctx, key, > + zone, natInfo, commit, currentTime, > + &entryCreated); > > - if (!entry) { > + } else { > + if (commit && ctTotalEntries >= CT_MAX_ENTRIES) { > + /* Don't proceed with processing if the max limit has been hit. > + * This blocks only new entries from being created and doesn't > + * affect existing connections. > + */ > + OVS_LOG_ERROR("Conntrack Limit hit: %lu", ctTotalEntries); > + return NDIS_STATUS_RESOURCES; > + } > /* If no matching entry was found, create one and add New state */ > entry = OvsCtEntryCreate(fwdCtx, key->ipKey.nwProto, > layers->l4Offset, &ctx, > key, natInfo, commit, currentTime, > &entryCreated); > - } else { > - /* Process the entry and update CT flags */ > - entry = OvsProcessConntrackEntry(fwdCtx, layers->l4Offset, &ctx, key, > - zone, natInfo, commit, currentTime, > - &entryCreated); > } > > if (entry == NULL) { > @@ -918,6 +916,8 @@ OvsCtExecute_(OvsForwardingContext *fwdCtx, > * NAT_ACTION_REVERSE, yet entry->natInfo is NAT_ACTION_SRC or > * NAT_ACTION_DST without NAT_ACTION_REVERSE > */ > + KIRQL irql = KeGetCurrentIrql(); > + OVS_ACQUIRE_SPIN_LOCK(&(entry->lock), irql); > if (natInfo->natAction != NAT_ACTION_NONE) > { > LOCK_STATE_EX lockStateNat; > @@ -939,15 +939,14 @@ OvsCtExecute_(OvsForwardingContext *fwdCtx, > OVS_LOG_ERROR("Error while parsing the FTP packet"); > } > } > - NdisAcquireRWLockRead(entry->lock, &lockState, 0); > + > /* Add original tuple information to flow Key */ > if (entry->key.dl_type == ntohs(ETH_TYPE_IPV4)) { > if (entry->parent != NULL) { > POVS_CT_ENTRY parent = entry->parent; > - LOCK_STATE_EX lockStateParent; > - NdisAcquireRWLockRead(parent->lock, &lockStateParent, 0); > + OVS_ACQUIRE_SPIN_LOCK(&(parent->lock), irql); > OvsCtUpdateTuple(key, &parent->key); > - NdisReleaseRWLock(parent->lock, &lockStateParent); > + OVS_RELEASE_SPIN_LOCK(&(parent->lock), irql); > } else { > OvsCtUpdateTuple(key, &entry->key); > } > @@ -955,13 +954,11 @@ OvsCtExecute_(OvsForwardingContext *fwdCtx, > > if (entryCreated) { > OvsPostCtEventEntry(entry, OVS_EVENT_CT_NEW); > - } > - if (postUpdateEvent && !entryCreated && triggerUpdateEvent) { > + } else if (postUpdateEvent && triggerUpdateEvent) { > OvsPostCtEventEntry(entry, OVS_EVENT_CT_UPDATE); > } > > - NdisReleaseRWLock(entry->lock, &lockState); > - > + OVS_RELEASE_SPIN_LOCK(&(entry->lock), irql); > return status; > } > > @@ -1738,8 +1735,9 @@ OvsCtDumpCmdHandler(POVS_USER_PARAMS_CONTEXT usrParamsCtx, > UINT32 inIndex = instance->dumpState.index[1]; > UINT32 i = CT_HASH_TABLE_SIZE; > UINT32 outIndex = 0; > + KIRQL irql = KeGetCurrentIrql(); > + LOCK_STATE_EX lockStateTable; > > - LOCK_STATE_EX lockState, lockStateTable; > if (ctTotalEntries) { > for (i = inBucket; i < CT_HASH_TABLE_SIZE; i++) { > PLIST_ENTRY head, link; > @@ -1756,7 +1754,7 @@ OvsCtDumpCmdHandler(POVS_USER_PARAMS_CONTEXT usrParamsCtx, > */ > if (outIndex >= inIndex) { > entry = CONTAINING_RECORD(link, OVS_CT_ENTRY, link); > - NdisAcquireRWLockRead(entry->lock, &lockState, 0); > + OVS_ACQUIRE_SPIN_LOCK(&(entry->lock), irql); > rc = OvsCreateNlMsgFromCtEntry(entry, > usrParamsCtx->outputBuffer, > usrParamsCtx->outputLength, > @@ -1765,7 +1763,7 @@ OvsCtDumpCmdHandler(POVS_USER_PARAMS_CONTEXT usrParamsCtx, > msgIn->nlMsg.nlmsgPid, > msgIn->nfGenMsg.version, > 0); > - NdisReleaseRWLock(entry->lock, &lockState); > + OVS_RELEASE_SPIN_LOCK(&(entry->lock), irql); > if (rc != NDIS_STATUS_SUCCESS) { > NdisReleaseRWLock(ovsCtBucketLock[i], &lockStateTable); > return STATUS_UNSUCCESSFUL; > diff --git a/datapath-windows/ovsext/Conntrack.h b/datapath-windows/ovsext/Conntrack.h > index 3be309e..7dc92a1 100644 > --- a/datapath-windows/ovsext/Conntrack.h > +++ b/datapath-windows/ovsext/Conntrack.h > @@ -101,7 +101,7 @@ typedef struct _NAT_ACTION_INFO { > typedef struct OVS_CT_ENTRY { > /* Reference to ovsCtBucketLock of ovsConntrackTable.*/ > PNDIS_RW_LOCK_EX bucketLockRef; > - PNDIS_RW_LOCK_EX lock; /* Protects OVS_CT_ENTRY. */ > + NDIS_SPIN_LOCK lock; /* Protects OVS_CT_ENTRY. */ > OVS_CT_KEY key; > OVS_CT_KEY rev_key; > UINT64 expiration; > diff --git a/datapath-windows/ovsext/Util.h b/datapath-windows/ovsext/Util.h > index 6f02147..a9bccf3 100644 > --- a/datapath-windows/ovsext/Util.h > +++ b/datapath-windows/ovsext/Util.h > @@ -105,6 +105,24 @@ VOID OvsAppendList(PLIST_ENTRY dst, PLIST_ENTRY src); > #define BIT16(_x) ((UINT16)0x1 << (_x)) > #define BIT32(_x) ((UINT32)0x1 << (_x)) > > +#define OVS_ACQUIRE_SPIN_LOCK(_pLock, _dispatchLevel) \ > +{ \ > + if (_dispatchLevel) { \ > + NdisDprAcquireSpinLock(_pLock); \ > + } else { \ > + NdisAcquireSpinLock(_pLock); \ > + } \ > +} > + > +#define OVS_RELEASE_SPIN_LOCK(_pLock, _dispatchLevel) \ > +{ \ > + if (_dispatchLevel) { \ > + NdisDprReleaseSpinLock(_pLock); \ > + } else { \ > + NdisReleaseSpinLock(_pLock); \ > + } \ > +} > + > BOOLEAN OvsCompareString(PVOID string1, PVOID string2); > > /*
Hi Shashank, Thanks for the review. Please find my response inline. Thanks, Anand Kumar From: Shashank Ram <rams@vmware.com> Date: Monday, June 18, 2018 at 11:27 AM To: Anand Kumar <kumaranand@vmware.com>, "dev@openvswitch.org" <dev@openvswitch.org> Subject: Re: [ovs-dev] [PATCH v4 1/4] datapath-windows: Use spinlock instead of RW lock for ct entry On 06/17/2018 10:37 PM, Anand Kumar wrote: This patch mainly changes a ndis RW lock for conntrack entry to a spinlock along with some minor refactor in conntrack. Using spinlock instead of RW lock as RW locks causes performance hits when acquired/released multiple times. - Use NdisInterlockedXX wrapper api's instead of InterlockedXX. - Update 'ctTotalRelatedEntries' using interlocked functions. - Move conntrack lock out of NAT module. Signed-off-by: Anand Kumar <kumaranand@vmware.com><mailto:kumaranand@vmware.com> --- datapath-windows/ovsext/Conntrack-nat.c | 7 +- datapath-windows/ovsext/Conntrack-related.c | 17 ++-- datapath-windows/ovsext/Conntrack.c | 134 ++++++++++++++-------------- datapath-windows/ovsext/Conntrack.h | 2 +- datapath-windows/ovsext/Util.h | 18 ++++ 5 files changed, 93 insertions(+), 85 deletions(-) diff --git a/datapath-windows/ovsext/Conntrack-nat.c b/datapath-windows/ovsext/Conntrack-nat.c index 316c946..da1814f 100644 --- a/datapath-windows/ovsext/Conntrack-nat.c +++ b/datapath-windows/ovsext/Conntrack-nat.c @@ -167,16 +167,13 @@ OvsNatPacket(OvsForwardingContext *ovsFwdCtx, { UINT32 natFlag; const struct ct_endpoint* endpoint; - LOCK_STATE_EX lockState; - /* XXX: Move conntrack locks out of NAT after implementing lock in NAT. */ - NdisAcquireRWLockRead(entry->lock, &lockState, 0); + /* When it is NAT, only entry->rev_key contains NATTED address; When it is unNAT, only entry->key contains the UNNATTED address;*/ const OVS_CT_KEY *ctKey = reverse ? &entry->key : &entry->rev_key; BOOLEAN isSrcNat; if (!(natAction & (NAT_ACTION_SRC | NAT_ACTION_DST))) { - NdisReleaseRWLock(entry->lock, &lockState); return; } isSrcNat = (((natAction & NAT_ACTION_SRC) && !reverse) || @@ -206,7 +203,6 @@ OvsNatPacket(OvsForwardingContext *ovsFwdCtx, } } else if (ctKey->dl_type == htons(ETH_TYPE_IPV6)){ // XXX: IPv6 packet not supported yet. - NdisReleaseRWLock(entry->lock, &lockState); return; } if (natAction & (NAT_ACTION_SRC_PORT | NAT_ACTION_DST_PORT)) { @@ -220,7 +216,6 @@ OvsNatPacket(OvsForwardingContext *ovsFwdCtx, } } } - NdisReleaseRWLock(entry->lock, &lockState); } diff --git a/datapath-windows/ovsext/Conntrack-related.c b/datapath-windows/ovsext/Conntrack-related.c index ec4b536..b798137 100644 --- a/datapath-windows/ovsext/Conntrack-related.c +++ b/datapath-windows/ovsext/Conntrack-related.c @@ -18,7 +18,7 @@ #include "Jhash.h" static PLIST_ENTRY ovsCtRelatedTable; /* Holds related entries */ -static UINT64 ctTotalRelatedEntries; +static ULONG ctTotalRelatedEntries; static OVS_CT_THREAD_CTX ctRelThreadCtx; static PNDIS_RW_LOCK_EX ovsCtRelatedLockObj; extern POVS_SWITCH_CONTEXT gOvsSwitchContext; @@ -75,13 +75,11 @@ OvsCtRelatedLookup(OVS_CT_KEY key, UINT64 currentTime) POVS_CT_REL_ENTRY entry; LOCK_STATE_EX lockState; - NdisAcquireRWLockRead(ovsCtRelatedLockObj, &lockState, 0); - if (!ctTotalRelatedEntries) { - NdisReleaseRWLock(ovsCtRelatedLockObj, &lockState); return NULL; } + NdisAcquireRWLockRead(ovsCtRelatedLockObj, &lockState, 0); for (int i = 0; i < CT_HASH_TABLE_SIZE; i++) { /* XXX - Scan the table based on the hash instead */ LIST_FORALL_SAFE(&ovsCtRelatedTable[i], link, next) { @@ -103,7 +101,7 @@ OvsCtRelatedEntryDelete(POVS_CT_REL_ENTRY entry) { RemoveEntryList(&entry->link); OvsFreeMemoryWithTag(entry, OVS_CT_POOL_TAG); - ctTotalRelatedEntries--; + NdisInterlockedDecrement((PLONG)&ctTotalRelatedEntries); } NDIS_STATUS @@ -139,7 +137,7 @@ OvsCtRelatedEntryCreate(UINT8 ipProto, NdisAcquireRWLockWrite(ovsCtRelatedLockObj, &lockState, 0); InsertHeadList(&ovsCtRelatedTable[hash & CT_HASH_TABLE_MASK], &entry->link); - ctTotalRelatedEntries++; + NdisInterlockedIncrement((PLONG)&ctTotalRelatedEntries); NdisReleaseRWLock(ovsCtRelatedLockObj, &lockState); return NDIS_STATUS_SUCCESS; @@ -150,11 +148,10 @@ OvsCtRelatedFlush() { PLIST_ENTRY link, next; POVS_CT_REL_ENTRY entry; - LOCK_STATE_EX lockState; - NdisAcquireRWLockWrite(ovsCtRelatedLockObj, &lockState, 0); if (ctTotalRelatedEntries) { + NdisAcquireRWLockWrite(ovsCtRelatedLockObj, &lockState, 0); for (int i = 0; i < CT_HASH_TABLE_SIZE; i++) { LIST_FORALL_SAFE(&ovsCtRelatedTable[i], link, next) { entry = CONTAINING_RECORD(link, OVS_CT_REL_ENTRY, link); @@ -189,9 +186,8 @@ OvsCtRelatedEntryCleaner(PVOID data) /* Lock has been freed by 'OvsCleanupCtRelated()' */ break; } - NdisAcquireRWLockWrite(ovsCtRelatedLockObj, &lockState, 0); + if (context->exit) { - NdisReleaseRWLock(ovsCtRelatedLockObj, &lockState); break; } @@ -201,6 +197,7 @@ OvsCtRelatedEntryCleaner(PVOID data) threadSleepTimeout = currentTime + CT_CLEANUP_INTERVAL; if (ctTotalRelatedEntries) { + NdisAcquireRWLockWrite(ovsCtRelatedLockObj, &lockState, 0); for (int i = 0; i < CT_HASH_TABLE_SIZE; i++) { LIST_FORALL_SAFE(&ovsCtRelatedTable[i], link, next) { entry = CONTAINING_RECORD(link, OVS_CT_REL_ENTRY, link); diff --git a/datapath-windows/ovsext/Conntrack.c b/datapath-windows/ovsext/Conntrack.c index add1491..7b54fba 100644 --- a/datapath-windows/ovsext/Conntrack.c +++ b/datapath-windows/ovsext/Conntrack.c @@ -34,7 +34,7 @@ static OVS_CT_THREAD_CTX ctThreadCtx; static PNDIS_RW_LOCK_EX *ovsCtBucketLock = NULL; static PNDIS_RW_LOCK_EX ovsCtNatLockObj; extern POVS_SWITCH_CONTEXT gOvsSwitchContext; -static LONG ctTotalEntries; +static ULONG ctTotalEntries; static __inline OvsCtFlush(UINT16 zone, struct ovs_key_ct_tuple_ipv4 *tuple); static __inline NDIS_STATUS @@ -208,7 +208,6 @@ OvsPostCtEventEntry(POVS_CT_ENTRY entry, UINT8 type) { OVS_CT_EVENT_ENTRY ctEventEntry = {0}; NdisMoveMemory(&ctEventEntry.entry, entry, sizeof(OVS_CT_ENTRY)); - ctEventEntry.entry.lock = NULL; ctEventEntry.entry.parent = NULL; ctEventEntry.type = type; OvsPostCtEvent(&ctEventEntry); @@ -217,8 +216,8 @@ OvsPostCtEventEntry(POVS_CT_ENTRY entry, UINT8 type) static __inline VOID OvsCtIncrementCounters(POVS_CT_ENTRY entry, BOOLEAN reply, PNET_BUFFER_LIST nbl) { - LOCK_STATE_EX lockState; - NdisAcquireRWLockWrite(entry->lock, &lockState, 0); + KIRQL irql = KeGetCurrentIrql(); + OVS_ACQUIRE_SPIN_LOCK(&(entry->lock), irql); if (reply) { entry->rev_key.byteCount+= OvsPacketLenNBL(nbl); entry->rev_key.packetCount++; @@ -226,11 +225,11 @@ OvsCtIncrementCounters(POVS_CT_ENTRY entry, BOOLEAN reply, PNET_BUFFER_LIST nbl) entry->key.byteCount += OvsPacketLenNBL(nbl); entry->key.packetCount++; } - NdisReleaseRWLock(entry->lock, &lockState); + OVS_RELEASE_SPIN_LOCK(&(entry->lock), irql); Can we simply use NdisInterlocked functions instead of having a lock to perform increments? Also, in case the lock is only being acquired once within a function, there is no need to calculate the dispatch level, let NDIS do it. Calculating the dispatch level helps only in cases where the locks are being acquired multiple times within a given context. [AK]: The stats are part of ct entry structure, we need lock to make sure entry is not being modified by some other thread. Regarding calculating the dispatch level for lock, I did this to keep the code consistent in conntrack. } static __inline BOOLEAN -OvsCtAddEntry(POVS_SWITCH_CONTEXT switchContext, POVS_CT_ENTRY entry, +OvsCtAddEntry(POVS_CT_ENTRY entry, OvsConntrackKeyLookupCtx *ctx, PNAT_ACTION_INFO natInfo, UINT64 now) { @@ -261,18 +260,14 @@ OvsCtAddEntry(POVS_SWITCH_CONTEXT switchContext, POVS_CT_ENTRY entry, } entry->timestampStart = now; - entry->lock = NdisAllocateRWLock(switchContext->NdisFilterHandle); - if (entry->lock == NULL) { - OVS_LOG_ERROR("NdisAllocateRWLock failed : Insufficient resources"); - return FALSE; - } + NdisAllocateSpinLock(&(entry->lock)); UINT32 bucketIdx = ctx->hash & CT_HASH_TABLE_MASK; entry->bucketLockRef = ovsCtBucketLock[bucketIdx]; NdisAcquireRWLockWrite(ovsCtBucketLock[bucketIdx], &lockState, 0); InsertHeadList(&ovsConntrackTable[bucketIdx], &entry->link); - InterlockedIncrement((LONG volatile *)&ctTotalEntries); + NdisInterlockedIncrement((PLONG)&ctTotalEntries); NdisReleaseRWLock(ovsCtBucketLock[bucketIdx], &lockState); return TRUE; } @@ -351,8 +346,7 @@ OvsCtEntryCreate(OvsForwardingContext *fwdCtx, if (state != OVS_CS_F_INVALID && commit) { if (entry) { entry->parent = parentEntry; - if (OvsCtAddEntry(fwdCtx->switchContext, entry, ctx, - natInfo, currentTime)) { + if (OvsCtAddEntry(entry, ctx, natInfo, currentTime)) { *entryCreated = TRUE; } else { /* Unable to add entry to the list */ @@ -382,8 +376,7 @@ OvsCtUpdateEntry(OVS_CT_ENTRY* entry, UINT64 now) { CT_UPDATE_RES status; - LOCK_STATE_EX lockState; - NdisAcquireRWLockWrite(entry->lock, &lockState, 0); + KIRQL irql = KeGetCurrentIrql(); switch (ipProto) { case IPPROTO_TCP: { @@ -394,20 +387,25 @@ OvsCtUpdateEntry(OVS_CT_ENTRY* entry, status = CT_UPDATE_INVALID; break; } + OVS_ACQUIRE_SPIN_LOCK(&(entry->lock), irql); status = OvsConntrackUpdateTcpEntry(entry, tcp, nbl, reply, now); + OVS_RELEASE_SPIN_LOCK(&(entry->lock), irql); break; } Could you fix the style here for consistency by removing the braces for the case above. [AK]: Done case IPPROTO_ICMP: + OVS_ACQUIRE_SPIN_LOCK(&(entry->lock), irql); status = OvsConntrackUpdateIcmpEntry(entry, reply, now); + OVS_RELEASE_SPIN_LOCK(&(entry->lock), irql); break; case IPPROTO_UDP: + OVS_ACQUIRE_SPIN_LOCK(&(entry->lock), irql); status = OvsConntrackUpdateOtherEntry(entry, reply, now); + OVS_RELEASE_SPIN_LOCK(&(entry->lock), irql); break; default: status = CT_UPDATE_INVALID; break; } - NdisReleaseRWLock(entry->lock, &lockState); return status; } @@ -431,8 +429,8 @@ OvsCtEntryDelete(POVS_CT_ENTRY entry, BOOLEAN forceDelete) if (entry == NULL) { return; } - LOCK_STATE_EX lockState; - NdisAcquireRWLockWrite(entry->lock, &lockState, 0); + KIRQL irql = KeGetCurrentIrql(); + OVS_ACQUIRE_SPIN_LOCK(&(entry->lock), irql); if (forceDelete || OvsCtEntryExpired(entry)) { if (entry->natInfo.natAction) { LOCK_STATE_EX lockStateNat; @@ -442,13 +440,13 @@ OvsCtEntryDelete(POVS_CT_ENTRY entry, BOOLEAN forceDelete) } OvsPostCtEventEntry(entry, OVS_EVENT_CT_DELETE); RemoveEntryList(&entry->link); - NdisReleaseRWLock(entry->lock, &lockState); - NdisFreeRWLock(entry->lock); + OVS_RELEASE_SPIN_LOCK(&(entry->lock), irql); + NdisFreeSpinLock(&(entry->lock)); OvsFreeMemoryWithTag(entry, OVS_CT_POOL_TAG); - InterlockedDecrement((LONG volatile*)&ctTotalEntries); + NdisInterlockedDecrement((PLONG)&ctTotalEntries); return; } - NdisReleaseRWLock(entry->lock, &lockState); + OVS_RELEASE_SPIN_LOCK(&(entry->lock), irql); Why is the entry lock not being freed here? [AK]: Entry lock will be freed only when entry is deleted. } static __inline NDIS_STATUS @@ -499,7 +497,7 @@ OvsCtLookup(OvsConntrackKeyLookupCtx *ctx) POVS_CT_ENTRY entry; BOOLEAN reply = FALSE; POVS_CT_ENTRY found = NULL; - LOCK_STATE_EX lockState, lockStateTable; + LOCK_STATE_EX lockStateTable; UINT32 bucketIdx; /* Reverse NAT must be performed before OvsCtLookup, so here * we simply need to flip the src and dst in key and compare @@ -512,11 +510,14 @@ OvsCtLookup(OvsConntrackKeyLookupCtx *ctx) if (!ctTotalEntries) { return found; } + + KIRQL irql = KeGetCurrentIrql(); bucketIdx = ctx->hash & CT_HASH_TABLE_MASK; NdisAcquireRWLockRead(ovsCtBucketLock[bucketIdx], &lockStateTable, 0); LIST_FORALL(&ovsConntrackTable[bucketIdx], link) { entry = CONTAINING_RECORD(link, OVS_CT_ENTRY, link); - NdisAcquireRWLockRead(entry->lock, &lockState, 0); + OVS_ACQUIRE_SPIN_LOCK(&(entry->lock), irql); + if (OvsCtKeyAreSame(ctx->key, entry->key)) { found = entry; reply = FALSE; @@ -533,10 +534,10 @@ OvsCtLookup(OvsConntrackKeyLookupCtx *ctx) } else { ctx->reply = reply; } - NdisReleaseRWLock(entry->lock, &lockState); + OVS_RELEASE_SPIN_LOCK(&(entry->lock), irql); break; } - NdisReleaseRWLock(entry->lock, &lockState); + OVS_RELEASE_SPIN_LOCK(&(entry->lock), irql); } NdisReleaseRWLock(ovsCtBucketLock[bucketIdx], &lockStateTable); @@ -689,7 +690,7 @@ OvsProcessConntrackEntry(OvsForwardingContext *fwdCtx, POVS_CT_ENTRY entry = ctx->entry; UINT32 state = 0; PNET_BUFFER_LIST curNbl = fwdCtx->curNbl; - LOCK_STATE_EX lockState, lockStateTable; + LOCK_STATE_EX lockStateTable; PNDIS_RW_LOCK_EX bucketLockRef = NULL; *entryCreated = FALSE; @@ -730,7 +731,8 @@ OvsProcessConntrackEntry(OvsForwardingContext *fwdCtx, } } if (entry) { - NdisAcquireRWLockRead(entry->lock, &lockState, 0); + KIRQL irql = KeGetCurrentIrql(); + OVS_ACQUIRE_SPIN_LOCK(&(entry->lock), irql); No need to figure out IRQL every time. [AK]: I have kept it so that code is consistent in conntrack. if (key->ipKey.nwProto == IPPROTO_TCP) { /* Update the related bit if there is a parent */ if (entry->parent) { @@ -748,7 +750,7 @@ OvsProcessConntrackEntry(OvsForwardingContext *fwdCtx, /* Copy mark and label from entry into flowKey. If actions specify different mark and label, update the flowKey. */ OvsCtUpdateFlowKey(key, state, zone, entry->mark, &entry->labels); - NdisReleaseRWLock(entry->lock, &lockState); + OVS_RELEASE_SPIN_LOCK(&(entry->lock), irql); } else { OvsCtUpdateFlowKey(key, state, zone, 0, NULL); } @@ -802,8 +804,6 @@ OvsCtSetMarkLabel(OvsFlowKey *key, MD_LABELS *labels, BOOLEAN *triggerUpdateEvent) { - LOCK_STATE_EX lockState; - NdisAcquireRWLockWrite(entry->lock, &lockState, 0); if (mark) { OvsConntrackSetMark(key, entry, mark->value, mark->mask, triggerUpdateEvent); @@ -813,7 +813,6 @@ OvsCtSetMarkLabel(OvsFlowKey *key, OvsConntrackSetLabels(key, entry, &labels->value, &labels->mask, triggerUpdateEvent); } - NdisReleaseRWLock(entry->lock, &lockState); } /* @@ -854,10 +853,11 @@ OvsCtExecute_(OvsForwardingContext *fwdCtx, { NDIS_STATUS status = NDIS_STATUS_SUCCESS; BOOLEAN triggerUpdateEvent = FALSE; + BOOLEAN entryCreated = FALSE; POVS_CT_ENTRY entry = NULL; PNET_BUFFER_LIST curNbl = fwdCtx->curNbl; OvsConntrackKeyLookupCtx ctx = { 0 }; - LOCK_STATE_EX lockState, lockStateTable; + LOCK_STATE_EX lockStateTable; UINT64 currentTime; NdisGetCurrentSystemTime((LARGE_INTEGER *) ¤tTime); @@ -866,10 +866,9 @@ OvsCtExecute_(OvsForwardingContext *fwdCtx, /* Lookup Conntrack entries for a matching entry */ entry = OvsCtLookup(&ctx); - BOOLEAN entryCreated = FALSE; /* Delete entry in reverse direction if 'force' is specified */ - if (entry && force && ctx.reply) { + if (force && ctx.reply && entry) { PNDIS_RW_LOCK_EX bucketLockRef = entry->bucketLockRef; NdisAcquireRWLockWrite(bucketLockRef, &lockStateTable, 0); OvsCtEntryDelete(entry, TRUE); @@ -877,34 +876,33 @@ OvsCtExecute_(OvsForwardingContext *fwdCtx, entry = NULL; } - if (!entry && commit && ctTotalEntries >= CT_MAX_ENTRIES) { - /* Don't proceed with processing if the max limit has been hit. - * This blocks only new entries from being created and doesn't - * affect existing connections. + if (entry) { + /* Increment stats for the entry if it wasn't tracked previously or + * if they are on different zones */ - OVS_LOG_ERROR("Conntrack Limit hit: %lu", ctTotalEntries); - return NDIS_STATUS_RESOURCES; - } - - /* Increment stats for the entry if it wasn't tracked previously or - * if they are on different zones - */ - if (entry && (entry->key.zone != key->ct.zone || - (!(key->ct.state & OVS_CS_F_TRACKED)))) { - OvsCtIncrementCounters(entry, ctx.reply, curNbl); - } + if ((entry->key.zone != key->ct.zone || + (!(key->ct.state & OVS_CS_F_TRACKED)))) { + OvsCtIncrementCounters(entry, ctx.reply, curNbl); + } + /* Process the entry and update CT flags */ + entry = OvsProcessConntrackEntry(fwdCtx, layers->l4Offset, &ctx, key, + zone, natInfo, commit, currentTime, + &entryCreated); - if (!entry) { + } else { + if (commit && ctTotalEntries >= CT_MAX_ENTRIES) { + /* Don't proceed with processing if the max limit has been hit. + * This blocks only new entries from being created and doesn't + * affect existing connections. + */ + OVS_LOG_ERROR("Conntrack Limit hit: %lu", ctTotalEntries); + return NDIS_STATUS_RESOURCES; + } /* If no matching entry was found, create one and add New state */ entry = OvsCtEntryCreate(fwdCtx, key->ipKey.nwProto, layers->l4Offset, &ctx, key, natInfo, commit, currentTime, &entryCreated); - } else { - /* Process the entry and update CT flags */ - entry = OvsProcessConntrackEntry(fwdCtx, layers->l4Offset, &ctx, key, - zone, natInfo, commit, currentTime, - &entryCreated); } if (entry == NULL) { @@ -918,6 +916,8 @@ OvsCtExecute_(OvsForwardingContext *fwdCtx, * NAT_ACTION_REVERSE, yet entry->natInfo is NAT_ACTION_SRC or * NAT_ACTION_DST without NAT_ACTION_REVERSE */ + KIRQL irql = KeGetCurrentIrql(); + OVS_ACQUIRE_SPIN_LOCK(&(entry->lock), irql); if (natInfo->natAction != NAT_ACTION_NONE) { LOCK_STATE_EX lockStateNat; @@ -939,15 +939,14 @@ OvsCtExecute_(OvsForwardingContext *fwdCtx, OVS_LOG_ERROR("Error while parsing the FTP packet"); } } - NdisAcquireRWLockRead(entry->lock, &lockState, 0); + /* Add original tuple information to flow Key */ if (entry->key.dl_type == ntohs(ETH_TYPE_IPV4)) { if (entry->parent != NULL) { POVS_CT_ENTRY parent = entry->parent; - LOCK_STATE_EX lockStateParent; - NdisAcquireRWLockRead(parent->lock, &lockStateParent, 0); + OVS_ACQUIRE_SPIN_LOCK(&(parent->lock), irql); OvsCtUpdateTuple(key, &parent->key); - NdisReleaseRWLock(parent->lock, &lockStateParent); + OVS_RELEASE_SPIN_LOCK(&(parent->lock), irql); } else { OvsCtUpdateTuple(key, &entry->key); } @@ -955,13 +954,11 @@ OvsCtExecute_(OvsForwardingContext *fwdCtx, if (entryCreated) { OvsPostCtEventEntry(entry, OVS_EVENT_CT_NEW); - } - if (postUpdateEvent && !entryCreated && triggerUpdateEvent) { + } else if (postUpdateEvent && triggerUpdateEvent) { OvsPostCtEventEntry(entry, OVS_EVENT_CT_UPDATE); } - NdisReleaseRWLock(entry->lock, &lockState); - + OVS_RELEASE_SPIN_LOCK(&(entry->lock), irql); return status; } @@ -1738,8 +1735,9 @@ OvsCtDumpCmdHandler(POVS_USER_PARAMS_CONTEXT usrParamsCtx, UINT32 inIndex = instance->dumpState.index[1]; UINT32 i = CT_HASH_TABLE_SIZE; UINT32 outIndex = 0; + KIRQL irql = KeGetCurrentIrql(); + LOCK_STATE_EX lockStateTable; - LOCK_STATE_EX lockState, lockStateTable; if (ctTotalEntries) { for (i = inBucket; i < CT_HASH_TABLE_SIZE; i++) { PLIST_ENTRY head, link; @@ -1756,7 +1754,7 @@ OvsCtDumpCmdHandler(POVS_USER_PARAMS_CONTEXT usrParamsCtx, */ if (outIndex >= inIndex) { entry = CONTAINING_RECORD(link, OVS_CT_ENTRY, link); - NdisAcquireRWLockRead(entry->lock, &lockState, 0); + OVS_ACQUIRE_SPIN_LOCK(&(entry->lock), irql); rc = OvsCreateNlMsgFromCtEntry(entry, usrParamsCtx->outputBuffer, usrParamsCtx->outputLength, @@ -1765,7 +1763,7 @@ OvsCtDumpCmdHandler(POVS_USER_PARAMS_CONTEXT usrParamsCtx, msgIn->nlMsg.nlmsgPid, msgIn->nfGenMsg.version, 0); - NdisReleaseRWLock(entry->lock, &lockState); + OVS_RELEASE_SPIN_LOCK(&(entry->lock), irql); if (rc != NDIS_STATUS_SUCCESS) { NdisReleaseRWLock(ovsCtBucketLock[i], &lockStateTable); return STATUS_UNSUCCESSFUL; diff --git a/datapath-windows/ovsext/Conntrack.h b/datapath-windows/ovsext/Conntrack.h index 3be309e..7dc92a1 100644 --- a/datapath-windows/ovsext/Conntrack.h +++ b/datapath-windows/ovsext/Conntrack.h @@ -101,7 +101,7 @@ typedef struct _NAT_ACTION_INFO { typedef struct OVS_CT_ENTRY { /* Reference to ovsCtBucketLock of ovsConntrackTable.*/ PNDIS_RW_LOCK_EX bucketLockRef; - PNDIS_RW_LOCK_EX lock; /* Protects OVS_CT_ENTRY. */ + NDIS_SPIN_LOCK lock; /* Protects OVS_CT_ENTRY. */ OVS_CT_KEY key; OVS_CT_KEY rev_key; UINT64 expiration; diff --git a/datapath-windows/ovsext/Util.h b/datapath-windows/ovsext/Util.h index 6f02147..a9bccf3 100644 --- a/datapath-windows/ovsext/Util.h +++ b/datapath-windows/ovsext/Util.h @@ -105,6 +105,24 @@ VOID OvsAppendList(PLIST_ENTRY dst, PLIST_ENTRY src); #define BIT16(_x) ((UINT16)0x1 << (_x)) #define BIT32(_x) ((UINT32)0x1 << (_x)) +#define OVS_ACQUIRE_SPIN_LOCK(_pLock, _dispatchLevel) \ +{ \ + if (_dispatchLevel) { \ + NdisDprAcquireSpinLock(_pLock); \ + } else { \ + NdisAcquireSpinLock(_pLock); \ + } \ +} + +#define OVS_RELEASE_SPIN_LOCK(_pLock, _dispatchLevel) \ +{ \ + if (_dispatchLevel) { \ + NdisDprReleaseSpinLock(_pLock); \ + } else { \ + NdisReleaseSpinLock(_pLock); \ + } \ +} + BOOLEAN OvsCompareString(PVOID string1, PVOID string2); /*
diff --git a/datapath-windows/ovsext/Conntrack-nat.c b/datapath-windows/ovsext/Conntrack-nat.c index 316c946..da1814f 100644 --- a/datapath-windows/ovsext/Conntrack-nat.c +++ b/datapath-windows/ovsext/Conntrack-nat.c @@ -167,16 +167,13 @@ OvsNatPacket(OvsForwardingContext *ovsFwdCtx, { UINT32 natFlag; const struct ct_endpoint* endpoint; - LOCK_STATE_EX lockState; - /* XXX: Move conntrack locks out of NAT after implementing lock in NAT. */ - NdisAcquireRWLockRead(entry->lock, &lockState, 0); + /* When it is NAT, only entry->rev_key contains NATTED address; When it is unNAT, only entry->key contains the UNNATTED address;*/ const OVS_CT_KEY *ctKey = reverse ? &entry->key : &entry->rev_key; BOOLEAN isSrcNat; if (!(natAction & (NAT_ACTION_SRC | NAT_ACTION_DST))) { - NdisReleaseRWLock(entry->lock, &lockState); return; } isSrcNat = (((natAction & NAT_ACTION_SRC) && !reverse) || @@ -206,7 +203,6 @@ OvsNatPacket(OvsForwardingContext *ovsFwdCtx, } } else if (ctKey->dl_type == htons(ETH_TYPE_IPV6)){ // XXX: IPv6 packet not supported yet. - NdisReleaseRWLock(entry->lock, &lockState); return; } if (natAction & (NAT_ACTION_SRC_PORT | NAT_ACTION_DST_PORT)) { @@ -220,7 +216,6 @@ OvsNatPacket(OvsForwardingContext *ovsFwdCtx, } } } - NdisReleaseRWLock(entry->lock, &lockState); } diff --git a/datapath-windows/ovsext/Conntrack-related.c b/datapath-windows/ovsext/Conntrack-related.c index ec4b536..b798137 100644 --- a/datapath-windows/ovsext/Conntrack-related.c +++ b/datapath-windows/ovsext/Conntrack-related.c @@ -18,7 +18,7 @@ #include "Jhash.h" static PLIST_ENTRY ovsCtRelatedTable; /* Holds related entries */ -static UINT64 ctTotalRelatedEntries; +static ULONG ctTotalRelatedEntries; static OVS_CT_THREAD_CTX ctRelThreadCtx; static PNDIS_RW_LOCK_EX ovsCtRelatedLockObj; extern POVS_SWITCH_CONTEXT gOvsSwitchContext; @@ -75,13 +75,11 @@ OvsCtRelatedLookup(OVS_CT_KEY key, UINT64 currentTime) POVS_CT_REL_ENTRY entry; LOCK_STATE_EX lockState; - NdisAcquireRWLockRead(ovsCtRelatedLockObj, &lockState, 0); - if (!ctTotalRelatedEntries) { - NdisReleaseRWLock(ovsCtRelatedLockObj, &lockState); return NULL; } + NdisAcquireRWLockRead(ovsCtRelatedLockObj, &lockState, 0); for (int i = 0; i < CT_HASH_TABLE_SIZE; i++) { /* XXX - Scan the table based on the hash instead */ LIST_FORALL_SAFE(&ovsCtRelatedTable[i], link, next) { @@ -103,7 +101,7 @@ OvsCtRelatedEntryDelete(POVS_CT_REL_ENTRY entry) { RemoveEntryList(&entry->link); OvsFreeMemoryWithTag(entry, OVS_CT_POOL_TAG); - ctTotalRelatedEntries--; + NdisInterlockedDecrement((PLONG)&ctTotalRelatedEntries); } NDIS_STATUS @@ -139,7 +137,7 @@ OvsCtRelatedEntryCreate(UINT8 ipProto, NdisAcquireRWLockWrite(ovsCtRelatedLockObj, &lockState, 0); InsertHeadList(&ovsCtRelatedTable[hash & CT_HASH_TABLE_MASK], &entry->link); - ctTotalRelatedEntries++; + NdisInterlockedIncrement((PLONG)&ctTotalRelatedEntries); NdisReleaseRWLock(ovsCtRelatedLockObj, &lockState); return NDIS_STATUS_SUCCESS; @@ -150,11 +148,10 @@ OvsCtRelatedFlush() { PLIST_ENTRY link, next; POVS_CT_REL_ENTRY entry; - LOCK_STATE_EX lockState; - NdisAcquireRWLockWrite(ovsCtRelatedLockObj, &lockState, 0); if (ctTotalRelatedEntries) { + NdisAcquireRWLockWrite(ovsCtRelatedLockObj, &lockState, 0); for (int i = 0; i < CT_HASH_TABLE_SIZE; i++) { LIST_FORALL_SAFE(&ovsCtRelatedTable[i], link, next) { entry = CONTAINING_RECORD(link, OVS_CT_REL_ENTRY, link); @@ -189,9 +186,8 @@ OvsCtRelatedEntryCleaner(PVOID data) /* Lock has been freed by 'OvsCleanupCtRelated()' */ break; } - NdisAcquireRWLockWrite(ovsCtRelatedLockObj, &lockState, 0); + if (context->exit) { - NdisReleaseRWLock(ovsCtRelatedLockObj, &lockState); break; } @@ -201,6 +197,7 @@ OvsCtRelatedEntryCleaner(PVOID data) threadSleepTimeout = currentTime + CT_CLEANUP_INTERVAL; if (ctTotalRelatedEntries) { + NdisAcquireRWLockWrite(ovsCtRelatedLockObj, &lockState, 0); for (int i = 0; i < CT_HASH_TABLE_SIZE; i++) { LIST_FORALL_SAFE(&ovsCtRelatedTable[i], link, next) { entry = CONTAINING_RECORD(link, OVS_CT_REL_ENTRY, link); diff --git a/datapath-windows/ovsext/Conntrack.c b/datapath-windows/ovsext/Conntrack.c index add1491..7b54fba 100644 --- a/datapath-windows/ovsext/Conntrack.c +++ b/datapath-windows/ovsext/Conntrack.c @@ -34,7 +34,7 @@ static OVS_CT_THREAD_CTX ctThreadCtx; static PNDIS_RW_LOCK_EX *ovsCtBucketLock = NULL; static PNDIS_RW_LOCK_EX ovsCtNatLockObj; extern POVS_SWITCH_CONTEXT gOvsSwitchContext; -static LONG ctTotalEntries; +static ULONG ctTotalEntries; static __inline OvsCtFlush(UINT16 zone, struct ovs_key_ct_tuple_ipv4 *tuple); static __inline NDIS_STATUS @@ -208,7 +208,6 @@ OvsPostCtEventEntry(POVS_CT_ENTRY entry, UINT8 type) { OVS_CT_EVENT_ENTRY ctEventEntry = {0}; NdisMoveMemory(&ctEventEntry.entry, entry, sizeof(OVS_CT_ENTRY)); - ctEventEntry.entry.lock = NULL; ctEventEntry.entry.parent = NULL; ctEventEntry.type = type; OvsPostCtEvent(&ctEventEntry); @@ -217,8 +216,8 @@ OvsPostCtEventEntry(POVS_CT_ENTRY entry, UINT8 type) static __inline VOID OvsCtIncrementCounters(POVS_CT_ENTRY entry, BOOLEAN reply, PNET_BUFFER_LIST nbl) { - LOCK_STATE_EX lockState; - NdisAcquireRWLockWrite(entry->lock, &lockState, 0); + KIRQL irql = KeGetCurrentIrql(); + OVS_ACQUIRE_SPIN_LOCK(&(entry->lock), irql); if (reply) { entry->rev_key.byteCount+= OvsPacketLenNBL(nbl); entry->rev_key.packetCount++; @@ -226,11 +225,11 @@ OvsCtIncrementCounters(POVS_CT_ENTRY entry, BOOLEAN reply, PNET_BUFFER_LIST nbl) entry->key.byteCount += OvsPacketLenNBL(nbl); entry->key.packetCount++; } - NdisReleaseRWLock(entry->lock, &lockState); + OVS_RELEASE_SPIN_LOCK(&(entry->lock), irql); } static __inline BOOLEAN -OvsCtAddEntry(POVS_SWITCH_CONTEXT switchContext, POVS_CT_ENTRY entry, +OvsCtAddEntry(POVS_CT_ENTRY entry, OvsConntrackKeyLookupCtx *ctx, PNAT_ACTION_INFO natInfo, UINT64 now) { @@ -261,18 +260,14 @@ OvsCtAddEntry(POVS_SWITCH_CONTEXT switchContext, POVS_CT_ENTRY entry, } entry->timestampStart = now; - entry->lock = NdisAllocateRWLock(switchContext->NdisFilterHandle); - if (entry->lock == NULL) { - OVS_LOG_ERROR("NdisAllocateRWLock failed : Insufficient resources"); - return FALSE; - } + NdisAllocateSpinLock(&(entry->lock)); UINT32 bucketIdx = ctx->hash & CT_HASH_TABLE_MASK; entry->bucketLockRef = ovsCtBucketLock[bucketIdx]; NdisAcquireRWLockWrite(ovsCtBucketLock[bucketIdx], &lockState, 0); InsertHeadList(&ovsConntrackTable[bucketIdx], &entry->link); - InterlockedIncrement((LONG volatile *)&ctTotalEntries); + NdisInterlockedIncrement((PLONG)&ctTotalEntries); NdisReleaseRWLock(ovsCtBucketLock[bucketIdx], &lockState); return TRUE; } @@ -351,8 +346,7 @@ OvsCtEntryCreate(OvsForwardingContext *fwdCtx, if (state != OVS_CS_F_INVALID && commit) { if (entry) { entry->parent = parentEntry; - if (OvsCtAddEntry(fwdCtx->switchContext, entry, ctx, - natInfo, currentTime)) { + if (OvsCtAddEntry(entry, ctx, natInfo, currentTime)) { *entryCreated = TRUE; } else { /* Unable to add entry to the list */ @@ -382,8 +376,7 @@ OvsCtUpdateEntry(OVS_CT_ENTRY* entry, UINT64 now) { CT_UPDATE_RES status; - LOCK_STATE_EX lockState; - NdisAcquireRWLockWrite(entry->lock, &lockState, 0); + KIRQL irql = KeGetCurrentIrql(); switch (ipProto) { case IPPROTO_TCP: { @@ -394,20 +387,25 @@ OvsCtUpdateEntry(OVS_CT_ENTRY* entry, status = CT_UPDATE_INVALID; break; } + OVS_ACQUIRE_SPIN_LOCK(&(entry->lock), irql); status = OvsConntrackUpdateTcpEntry(entry, tcp, nbl, reply, now); + OVS_RELEASE_SPIN_LOCK(&(entry->lock), irql); break; } case IPPROTO_ICMP: + OVS_ACQUIRE_SPIN_LOCK(&(entry->lock), irql); status = OvsConntrackUpdateIcmpEntry(entry, reply, now); + OVS_RELEASE_SPIN_LOCK(&(entry->lock), irql); break; case IPPROTO_UDP: + OVS_ACQUIRE_SPIN_LOCK(&(entry->lock), irql); status = OvsConntrackUpdateOtherEntry(entry, reply, now); + OVS_RELEASE_SPIN_LOCK(&(entry->lock), irql); break; default: status = CT_UPDATE_INVALID; break; } - NdisReleaseRWLock(entry->lock, &lockState); return status; } @@ -431,8 +429,8 @@ OvsCtEntryDelete(POVS_CT_ENTRY entry, BOOLEAN forceDelete) if (entry == NULL) { return; } - LOCK_STATE_EX lockState; - NdisAcquireRWLockWrite(entry->lock, &lockState, 0); + KIRQL irql = KeGetCurrentIrql(); + OVS_ACQUIRE_SPIN_LOCK(&(entry->lock), irql); if (forceDelete || OvsCtEntryExpired(entry)) { if (entry->natInfo.natAction) { LOCK_STATE_EX lockStateNat; @@ -442,13 +440,13 @@ OvsCtEntryDelete(POVS_CT_ENTRY entry, BOOLEAN forceDelete) } OvsPostCtEventEntry(entry, OVS_EVENT_CT_DELETE); RemoveEntryList(&entry->link); - NdisReleaseRWLock(entry->lock, &lockState); - NdisFreeRWLock(entry->lock); + OVS_RELEASE_SPIN_LOCK(&(entry->lock), irql); + NdisFreeSpinLock(&(entry->lock)); OvsFreeMemoryWithTag(entry, OVS_CT_POOL_TAG); - InterlockedDecrement((LONG volatile*)&ctTotalEntries); + NdisInterlockedDecrement((PLONG)&ctTotalEntries); return; } - NdisReleaseRWLock(entry->lock, &lockState); + OVS_RELEASE_SPIN_LOCK(&(entry->lock), irql); } static __inline NDIS_STATUS @@ -499,7 +497,7 @@ OvsCtLookup(OvsConntrackKeyLookupCtx *ctx) POVS_CT_ENTRY entry; BOOLEAN reply = FALSE; POVS_CT_ENTRY found = NULL; - LOCK_STATE_EX lockState, lockStateTable; + LOCK_STATE_EX lockStateTable; UINT32 bucketIdx; /* Reverse NAT must be performed before OvsCtLookup, so here * we simply need to flip the src and dst in key and compare @@ -512,11 +510,14 @@ OvsCtLookup(OvsConntrackKeyLookupCtx *ctx) if (!ctTotalEntries) { return found; } + + KIRQL irql = KeGetCurrentIrql(); bucketIdx = ctx->hash & CT_HASH_TABLE_MASK; NdisAcquireRWLockRead(ovsCtBucketLock[bucketIdx], &lockStateTable, 0); LIST_FORALL(&ovsConntrackTable[bucketIdx], link) { entry = CONTAINING_RECORD(link, OVS_CT_ENTRY, link); - NdisAcquireRWLockRead(entry->lock, &lockState, 0); + OVS_ACQUIRE_SPIN_LOCK(&(entry->lock), irql); + if (OvsCtKeyAreSame(ctx->key, entry->key)) { found = entry; reply = FALSE; @@ -533,10 +534,10 @@ OvsCtLookup(OvsConntrackKeyLookupCtx *ctx) } else { ctx->reply = reply; } - NdisReleaseRWLock(entry->lock, &lockState); + OVS_RELEASE_SPIN_LOCK(&(entry->lock), irql); break; } - NdisReleaseRWLock(entry->lock, &lockState); + OVS_RELEASE_SPIN_LOCK(&(entry->lock), irql); } NdisReleaseRWLock(ovsCtBucketLock[bucketIdx], &lockStateTable); @@ -689,7 +690,7 @@ OvsProcessConntrackEntry(OvsForwardingContext *fwdCtx, POVS_CT_ENTRY entry = ctx->entry; UINT32 state = 0; PNET_BUFFER_LIST curNbl = fwdCtx->curNbl; - LOCK_STATE_EX lockState, lockStateTable; + LOCK_STATE_EX lockStateTable; PNDIS_RW_LOCK_EX bucketLockRef = NULL; *entryCreated = FALSE; @@ -730,7 +731,8 @@ OvsProcessConntrackEntry(OvsForwardingContext *fwdCtx, } } if (entry) { - NdisAcquireRWLockRead(entry->lock, &lockState, 0); + KIRQL irql = KeGetCurrentIrql(); + OVS_ACQUIRE_SPIN_LOCK(&(entry->lock), irql); if (key->ipKey.nwProto == IPPROTO_TCP) { /* Update the related bit if there is a parent */ if (entry->parent) { @@ -748,7 +750,7 @@ OvsProcessConntrackEntry(OvsForwardingContext *fwdCtx, /* Copy mark and label from entry into flowKey. If actions specify different mark and label, update the flowKey. */ OvsCtUpdateFlowKey(key, state, zone, entry->mark, &entry->labels); - NdisReleaseRWLock(entry->lock, &lockState); + OVS_RELEASE_SPIN_LOCK(&(entry->lock), irql); } else { OvsCtUpdateFlowKey(key, state, zone, 0, NULL); } @@ -802,8 +804,6 @@ OvsCtSetMarkLabel(OvsFlowKey *key, MD_LABELS *labels, BOOLEAN *triggerUpdateEvent) { - LOCK_STATE_EX lockState; - NdisAcquireRWLockWrite(entry->lock, &lockState, 0); if (mark) { OvsConntrackSetMark(key, entry, mark->value, mark->mask, triggerUpdateEvent); @@ -813,7 +813,6 @@ OvsCtSetMarkLabel(OvsFlowKey *key, OvsConntrackSetLabels(key, entry, &labels->value, &labels->mask, triggerUpdateEvent); } - NdisReleaseRWLock(entry->lock, &lockState); } /* @@ -854,10 +853,11 @@ OvsCtExecute_(OvsForwardingContext *fwdCtx, { NDIS_STATUS status = NDIS_STATUS_SUCCESS; BOOLEAN triggerUpdateEvent = FALSE; + BOOLEAN entryCreated = FALSE; POVS_CT_ENTRY entry = NULL; PNET_BUFFER_LIST curNbl = fwdCtx->curNbl; OvsConntrackKeyLookupCtx ctx = { 0 }; - LOCK_STATE_EX lockState, lockStateTable; + LOCK_STATE_EX lockStateTable; UINT64 currentTime; NdisGetCurrentSystemTime((LARGE_INTEGER *) ¤tTime); @@ -866,10 +866,9 @@ OvsCtExecute_(OvsForwardingContext *fwdCtx, /* Lookup Conntrack entries for a matching entry */ entry = OvsCtLookup(&ctx); - BOOLEAN entryCreated = FALSE; /* Delete entry in reverse direction if 'force' is specified */ - if (entry && force && ctx.reply) { + if (force && ctx.reply && entry) { PNDIS_RW_LOCK_EX bucketLockRef = entry->bucketLockRef; NdisAcquireRWLockWrite(bucketLockRef, &lockStateTable, 0); OvsCtEntryDelete(entry, TRUE); @@ -877,34 +876,33 @@ OvsCtExecute_(OvsForwardingContext *fwdCtx, entry = NULL; } - if (!entry && commit && ctTotalEntries >= CT_MAX_ENTRIES) { - /* Don't proceed with processing if the max limit has been hit. - * This blocks only new entries from being created and doesn't - * affect existing connections. + if (entry) { + /* Increment stats for the entry if it wasn't tracked previously or + * if they are on different zones */ - OVS_LOG_ERROR("Conntrack Limit hit: %lu", ctTotalEntries); - return NDIS_STATUS_RESOURCES; - } - - /* Increment stats for the entry if it wasn't tracked previously or - * if they are on different zones - */ - if (entry && (entry->key.zone != key->ct.zone || - (!(key->ct.state & OVS_CS_F_TRACKED)))) { - OvsCtIncrementCounters(entry, ctx.reply, curNbl); - } + if ((entry->key.zone != key->ct.zone || + (!(key->ct.state & OVS_CS_F_TRACKED)))) { + OvsCtIncrementCounters(entry, ctx.reply, curNbl); + } + /* Process the entry and update CT flags */ + entry = OvsProcessConntrackEntry(fwdCtx, layers->l4Offset, &ctx, key, + zone, natInfo, commit, currentTime, + &entryCreated); - if (!entry) { + } else { + if (commit && ctTotalEntries >= CT_MAX_ENTRIES) { + /* Don't proceed with processing if the max limit has been hit. + * This blocks only new entries from being created and doesn't + * affect existing connections. + */ + OVS_LOG_ERROR("Conntrack Limit hit: %lu", ctTotalEntries); + return NDIS_STATUS_RESOURCES; + } /* If no matching entry was found, create one and add New state */ entry = OvsCtEntryCreate(fwdCtx, key->ipKey.nwProto, layers->l4Offset, &ctx, key, natInfo, commit, currentTime, &entryCreated); - } else { - /* Process the entry and update CT flags */ - entry = OvsProcessConntrackEntry(fwdCtx, layers->l4Offset, &ctx, key, - zone, natInfo, commit, currentTime, - &entryCreated); } if (entry == NULL) { @@ -918,6 +916,8 @@ OvsCtExecute_(OvsForwardingContext *fwdCtx, * NAT_ACTION_REVERSE, yet entry->natInfo is NAT_ACTION_SRC or * NAT_ACTION_DST without NAT_ACTION_REVERSE */ + KIRQL irql = KeGetCurrentIrql(); + OVS_ACQUIRE_SPIN_LOCK(&(entry->lock), irql); if (natInfo->natAction != NAT_ACTION_NONE) { LOCK_STATE_EX lockStateNat; @@ -939,15 +939,14 @@ OvsCtExecute_(OvsForwardingContext *fwdCtx, OVS_LOG_ERROR("Error while parsing the FTP packet"); } } - NdisAcquireRWLockRead(entry->lock, &lockState, 0); + /* Add original tuple information to flow Key */ if (entry->key.dl_type == ntohs(ETH_TYPE_IPV4)) { if (entry->parent != NULL) { POVS_CT_ENTRY parent = entry->parent; - LOCK_STATE_EX lockStateParent; - NdisAcquireRWLockRead(parent->lock, &lockStateParent, 0); + OVS_ACQUIRE_SPIN_LOCK(&(parent->lock), irql); OvsCtUpdateTuple(key, &parent->key); - NdisReleaseRWLock(parent->lock, &lockStateParent); + OVS_RELEASE_SPIN_LOCK(&(parent->lock), irql); } else { OvsCtUpdateTuple(key, &entry->key); } @@ -955,13 +954,11 @@ OvsCtExecute_(OvsForwardingContext *fwdCtx, if (entryCreated) { OvsPostCtEventEntry(entry, OVS_EVENT_CT_NEW); - } - if (postUpdateEvent && !entryCreated && triggerUpdateEvent) { + } else if (postUpdateEvent && triggerUpdateEvent) { OvsPostCtEventEntry(entry, OVS_EVENT_CT_UPDATE); } - NdisReleaseRWLock(entry->lock, &lockState); - + OVS_RELEASE_SPIN_LOCK(&(entry->lock), irql); return status; } @@ -1738,8 +1735,9 @@ OvsCtDumpCmdHandler(POVS_USER_PARAMS_CONTEXT usrParamsCtx, UINT32 inIndex = instance->dumpState.index[1]; UINT32 i = CT_HASH_TABLE_SIZE; UINT32 outIndex = 0; + KIRQL irql = KeGetCurrentIrql(); + LOCK_STATE_EX lockStateTable; - LOCK_STATE_EX lockState, lockStateTable; if (ctTotalEntries) { for (i = inBucket; i < CT_HASH_TABLE_SIZE; i++) { PLIST_ENTRY head, link; @@ -1756,7 +1754,7 @@ OvsCtDumpCmdHandler(POVS_USER_PARAMS_CONTEXT usrParamsCtx, */ if (outIndex >= inIndex) { entry = CONTAINING_RECORD(link, OVS_CT_ENTRY, link); - NdisAcquireRWLockRead(entry->lock, &lockState, 0); + OVS_ACQUIRE_SPIN_LOCK(&(entry->lock), irql); rc = OvsCreateNlMsgFromCtEntry(entry, usrParamsCtx->outputBuffer, usrParamsCtx->outputLength, @@ -1765,7 +1763,7 @@ OvsCtDumpCmdHandler(POVS_USER_PARAMS_CONTEXT usrParamsCtx, msgIn->nlMsg.nlmsgPid, msgIn->nfGenMsg.version, 0); - NdisReleaseRWLock(entry->lock, &lockState); + OVS_RELEASE_SPIN_LOCK(&(entry->lock), irql); if (rc != NDIS_STATUS_SUCCESS) { NdisReleaseRWLock(ovsCtBucketLock[i], &lockStateTable); return STATUS_UNSUCCESSFUL; diff --git a/datapath-windows/ovsext/Conntrack.h b/datapath-windows/ovsext/Conntrack.h index 3be309e..7dc92a1 100644 --- a/datapath-windows/ovsext/Conntrack.h +++ b/datapath-windows/ovsext/Conntrack.h @@ -101,7 +101,7 @@ typedef struct _NAT_ACTION_INFO { typedef struct OVS_CT_ENTRY { /* Reference to ovsCtBucketLock of ovsConntrackTable.*/ PNDIS_RW_LOCK_EX bucketLockRef; - PNDIS_RW_LOCK_EX lock; /* Protects OVS_CT_ENTRY. */ + NDIS_SPIN_LOCK lock; /* Protects OVS_CT_ENTRY. */ OVS_CT_KEY key; OVS_CT_KEY rev_key; UINT64 expiration; diff --git a/datapath-windows/ovsext/Util.h b/datapath-windows/ovsext/Util.h index 6f02147..a9bccf3 100644 --- a/datapath-windows/ovsext/Util.h +++ b/datapath-windows/ovsext/Util.h @@ -105,6 +105,24 @@ VOID OvsAppendList(PLIST_ENTRY dst, PLIST_ENTRY src); #define BIT16(_x) ((UINT16)0x1 << (_x)) #define BIT32(_x) ((UINT32)0x1 << (_x)) +#define OVS_ACQUIRE_SPIN_LOCK(_pLock, _dispatchLevel) \ +{ \ + if (_dispatchLevel) { \ + NdisDprAcquireSpinLock(_pLock); \ + } else { \ + NdisAcquireSpinLock(_pLock); \ + } \ +} + +#define OVS_RELEASE_SPIN_LOCK(_pLock, _dispatchLevel) \ +{ \ + if (_dispatchLevel) { \ + NdisDprReleaseSpinLock(_pLock); \ + } else { \ + NdisReleaseSpinLock(_pLock); \ + } \ +} + BOOLEAN OvsCompareString(PVOID string1, PVOID string2); /*
This patch mainly changes a ndis RW lock for conntrack entry to a spinlock along with some minor refactor in conntrack. Using spinlock instead of RW lock as RW locks causes performance hits when acquired/released multiple times. - Use NdisInterlockedXX wrapper api's instead of InterlockedXX. - Update 'ctTotalRelatedEntries' using interlocked functions. - Move conntrack lock out of NAT module. Signed-off-by: Anand Kumar <kumaranand@vmware.com> --- datapath-windows/ovsext/Conntrack-nat.c | 7 +- datapath-windows/ovsext/Conntrack-related.c | 17 ++-- datapath-windows/ovsext/Conntrack.c | 134 ++++++++++++++-------------- datapath-windows/ovsext/Conntrack.h | 2 +- datapath-windows/ovsext/Util.h | 18 ++++ 5 files changed, 93 insertions(+), 85 deletions(-)