From patchwork Mon Jan 29 18:27:59 2018 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Anand Kumar X-Patchwork-Id: 867296 Return-Path: X-Original-To: incoming@patchwork.ozlabs.org Delivered-To: patchwork-incoming@bilbo.ozlabs.org Authentication-Results: ozlabs.org; spf=pass (mailfrom) smtp.mailfrom=openvswitch.org (client-ip=140.211.169.12; helo=mail.linuxfoundation.org; envelope-from=ovs-dev-bounces@openvswitch.org; receiver=) Received: from mail.linuxfoundation.org (mail.linuxfoundation.org [140.211.169.12]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by ozlabs.org (Postfix) with ESMTPS id 3zVhZy5sXnz9s7s for ; Tue, 30 Jan 2018 07:54:34 +1100 (AEDT) Received: from mail.linux-foundation.org (localhost [127.0.0.1]) by mail.linuxfoundation.org (Postfix) with ESMTP id 2BE83D28; Mon, 29 Jan 2018 20:12:15 +0000 (UTC) X-Original-To: dev@openvswitch.org Delivered-To: ovs-dev@mail.linuxfoundation.org Received: from smtp1.linuxfoundation.org (smtp1.linux-foundation.org [172.17.192.35]) by mail.linuxfoundation.org (Postfix) with ESMTPS id B2E024371 for ; Mon, 29 Jan 2018 18:29:06 +0000 (UTC) X-Greylist: domain auto-whitelisted by SQLgrey-1.7.6 Received: from EX13-EDG-OU-001.vmware.com (ex13-edg-ou-001.vmware.com [208.91.0.189]) by smtp1.linuxfoundation.org (Postfix) with ESMTPS id B08331CE for ; Mon, 29 Jan 2018 18:29:05 +0000 (UTC) Received: from sc9-mailhost2.vmware.com (10.113.161.72) by EX13-EDG-OU-001.vmware.com (10.113.208.155) with Microsoft SMTP Server id 15.0.1156.6; Mon, 29 Jan 2018 10:29:03 -0800 Received: from localhost.localdomain (win-anand1.prom.eng.vmware.com [10.33.78.106]) by sc9-mailhost2.vmware.com (Postfix) with ESMTP id 1658CB036E; Mon, 29 Jan 2018 10:29:05 -0800 (PST) From: Anand Kumar To: Date: Mon, 29 Jan 2018 10:27:59 -0800 Message-ID: <20180129182801.3360-2-kumaranand@vmware.com> X-Mailer: git-send-email 2.9.3.windows.1 In-Reply-To: <20180129182801.3360-1-kumaranand@vmware.com> References: <20180129182801.3360-1-kumaranand@vmware.com> MIME-Version: 1.0 Received-SPF: None (EX13-EDG-OU-001.vmware.com: kumaranand@vmware.com does not designate permitted sender hosts) X-Spam-Status: No, score=-4.2 required=5.0 tests=BAYES_00, RCVD_IN_DNSWL_MED, T_RP_MATCHES_RCVD autolearn=ham version=3.3.1 X-Spam-Checker-Version: SpamAssassin 3.3.1 (2010-03-16) on smtp1.linux-foundation.org Subject: [ovs-dev] [PATCH v4 1/3] datapath-windows: Refactor conntrack code. X-BeenThere: ovs-dev@openvswitch.org X-Mailman-Version: 2.1.12 Precedence: list List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Sender: ovs-dev-bounces@openvswitch.org Errors-To: ovs-dev-bounces@openvswitch.org Some of the functions and code are refactored so that new conntrack lock can be implemented Signed-off-by: Anand Kumar Acked-by: Alin Gabriel Serdean --- datapath-windows/ovsext/Conntrack-nat.c | 11 +- datapath-windows/ovsext/Conntrack.c | 174 ++++++++++++++++++-------------- datapath-windows/ovsext/Conntrack.h | 4 - 3 files changed, 103 insertions(+), 86 deletions(-) diff --git a/datapath-windows/ovsext/Conntrack-nat.c b/datapath-windows/ovsext/Conntrack-nat.c index c778f12..7975770 100644 --- a/datapath-windows/ovsext/Conntrack-nat.c +++ b/datapath-windows/ovsext/Conntrack-nat.c @@ -93,26 +93,23 @@ NTSTATUS OvsNatInit() sizeof(LIST_ENTRY) * NAT_HASH_TABLE_SIZE, OVS_CT_POOL_TAG); if (ovsNatTable == NULL) { - goto failNoMem; + return STATUS_INSUFFICIENT_RESOURCES; } ovsUnNatTable = OvsAllocateMemoryWithTag( sizeof(LIST_ENTRY) * NAT_HASH_TABLE_SIZE, OVS_CT_POOL_TAG); if (ovsUnNatTable == NULL) { - goto freeNatTable; + OvsFreeMemoryWithTag(ovsNatTable, OVS_CT_POOL_TAG); + return STATUS_INSUFFICIENT_RESOURCES; } for (int i = 0; i < NAT_HASH_TABLE_SIZE; i++) { InitializeListHead(&ovsNatTable[i]); InitializeListHead(&ovsUnNatTable[i]); } - return STATUS_SUCCESS; -freeNatTable: - OvsFreeMemoryWithTag(ovsNatTable, OVS_CT_POOL_TAG); -failNoMem: - return STATUS_INSUFFICIENT_RESOURCES; + return STATUS_SUCCESS; } /* diff --git a/datapath-windows/ovsext/Conntrack.c b/datapath-windows/ovsext/Conntrack.c index 169ec4f..3cde836 100644 --- a/datapath-windows/ovsext/Conntrack.c +++ b/datapath-windows/ovsext/Conntrack.c @@ -33,7 +33,7 @@ static PLIST_ENTRY ovsConntrackTable; static OVS_CT_THREAD_CTX ctThreadCtx; static PNDIS_RW_LOCK_EX ovsConntrackLockObj; extern POVS_SWITCH_CONTEXT gOvsSwitchContext; -static UINT64 ctTotalEntries; +static LONG ctTotalEntries; static __inline OvsCtFlush(UINT16 zone, struct ovs_key_ct_tuple_ipv4 *tuple); static __inline NDIS_STATUS @@ -212,7 +212,7 @@ OvsCtAddEntry(POVS_CT_ENTRY entry, OvsConntrackKeyLookupCtx *ctx, InsertHeadList(&ovsConntrackTable[ctx->hash & CT_HASH_TABLE_MASK], &entry->link); - ctTotalEntries++; + InterlockedIncrement((LONG volatile *)&ctTotalEntries); return TRUE; } @@ -235,11 +235,6 @@ OvsCtEntryCreate(OvsForwardingContext *fwdCtx, *entryCreated = FALSE; state |= OVS_CS_F_NEW; - parentEntry = OvsCtRelatedLookup(ctx->key, currentTime); - if (parentEntry != NULL) { - state |= OVS_CS_F_RELATED; - } - switch (ipProto) { case IPPROTO_TCP: { @@ -283,6 +278,11 @@ OvsCtEntryCreate(OvsForwardingContext *fwdCtx, break; } + parentEntry = OvsCtRelatedLookup(ctx->key, currentTime); + if (parentEntry != NULL && state != OVS_CS_F_INVALID) { + state |= OVS_CS_F_RELATED; + } + if (state != OVS_CS_F_INVALID && commit) { if (entry) { entry->parent = parentEntry; @@ -315,6 +315,7 @@ OvsCtUpdateEntry(OVS_CT_ENTRY* entry, BOOLEAN reply, UINT64 now) { + CT_UPDATE_RES status; switch (ipProto) { case IPPROTO_TCP: { @@ -322,32 +323,23 @@ OvsCtUpdateEntry(OVS_CT_ENTRY* entry, const TCPHdr *tcp; tcp = OvsGetTcp(nbl, l4Offset, &tcpStorage); if (!tcp) { - return CT_UPDATE_INVALID; + status = CT_UPDATE_INVALID; + break; } - return OvsConntrackUpdateTcpEntry(entry, tcp, nbl, reply, now); + status = OvsConntrackUpdateTcpEntry(entry, tcp, nbl, reply, now); + break; } case IPPROTO_ICMP: - return OvsConntrackUpdateIcmpEntry(entry, reply, now); + status = OvsConntrackUpdateIcmpEntry(entry, reply, now); + break; case IPPROTO_UDP: - return OvsConntrackUpdateOtherEntry(entry, reply, now); + status = OvsConntrackUpdateOtherEntry(entry, reply, now); + break; default: - return CT_UPDATE_INVALID; - } -} - -static __inline VOID -OvsCtEntryDelete(POVS_CT_ENTRY entry) -{ - if (entry == NULL) { - return; - } - if (entry->natInfo.natAction) { - OvsNatDeleteKey(&entry->key); + status = CT_UPDATE_INVALID; + break; } - OvsPostCtEventEntry(entry, OVS_EVENT_CT_DELETE); - RemoveEntryList(&entry->link); - OvsFreeMemoryWithTag(entry, OVS_CT_POOL_TAG); - ctTotalEntries--; + return status; } static __inline BOOLEAN @@ -358,6 +350,24 @@ OvsCtEntryExpired(POVS_CT_ENTRY entry) return entry->expiration < currentTime; } +static __inline VOID +OvsCtEntryDelete(POVS_CT_ENTRY entry, BOOLEAN forceDelete) +{ + if (entry == NULL) { + return; + } + if (forceDelete || OvsCtEntryExpired(entry)) { + if (entry->natInfo.natAction) { + OvsNatDeleteKey(&entry->key); + } + OvsPostCtEventEntry(entry, OVS_EVENT_CT_DELETE); + RemoveEntryList(&entry->link); + OvsFreeMemoryWithTag(entry, OVS_CT_POOL_TAG); + InterlockedDecrement((LONG volatile*)&ctTotalEntries); + return; + } +} + static __inline NDIS_STATUS OvsDetectCtPacket(OvsForwardingContext *fwdCtx, OvsFlowKey *key, @@ -425,21 +435,20 @@ OvsCtLookup(OvsConntrackKeyLookupCtx *ctx) if (OvsCtKeyAreSame(ctx->key, entry->key)) { found = entry; reply = FALSE; - break; } - if (OvsCtKeyAreSame(revCtxKey, entry->key)) { + if (!found && OvsCtKeyAreSame(revCtxKey, entry->key)) { found = entry; reply = TRUE; - break; } - } - if (found) { - if (OvsCtEntryExpired(found)) { - found = NULL; - } else { - ctx->reply = reply; + if (found) { + if (OvsCtEntryExpired(found)) { + found = NULL; + } else { + ctx->reply = reply; + } + break; } } @@ -613,7 +622,7 @@ OvsProcessConntrackEntry(OvsForwardingContext *fwdCtx, break; case CT_UPDATE_NEW: //Delete and update the Conntrack - OvsCtEntryDelete(ctx->entry); + OvsCtEntryDelete(ctx->entry, TRUE); ctx->entry = NULL; entry = OvsCtEntryCreate(fwdCtx, key->ipKey.nwProto, l4Offset, ctx, key, natInfo, commit, currentTime, @@ -689,6 +698,41 @@ OvsConntrackSetLabels(OvsFlowKey *key, sizeof(struct ovs_key_ct_labels)); } +static void +OvsCtSetMarkLabel(OvsFlowKey *key, + POVS_CT_ENTRY entry, + MD_MARK *mark, + MD_LABELS *labels, + BOOLEAN *triggerUpdateEvent) +{ + if (mark) { + OvsConntrackSetMark(key, entry, mark->value, mark->mask, + triggerUpdateEvent); + } + + if (labels) { + OvsConntrackSetLabels(key, entry, &labels->value, &labels->mask, + triggerUpdateEvent); + } +} + +static __inline void +OvsCtUpdateTuple(OvsFlowKey *key, OVS_CT_KEY *ctKey) +{ + key->ct.tuple_ipv4.ipv4_src = ctKey->src.addr.ipv4_aligned; + key->ct.tuple_ipv4.ipv4_dst = ctKey->dst.addr.ipv4_aligned; + key->ct.tuple_ipv4.ipv4_proto = ctKey->nw_proto; + + /* Orig tuple Port is overloaded to take in ICMP-Type & Code */ + /* This mimics the behavior in lib/conntrack.c*/ + key->ct.tuple_ipv4.src_port = ctKey->nw_proto != IPPROTO_ICMP ? + ctKey->src.port : + htons(ctKey->src.icmp_type); + key->ct.tuple_ipv4.dst_port = ctKey->nw_proto != IPPROTO_ICMP ? + ctKey->dst.port : + htons(ctKey->src.icmp_code); +} + static __inline NDIS_STATUS OvsCtExecute_(OvsForwardingContext *fwdCtx, OvsFlowKey *key, @@ -723,7 +767,7 @@ OvsCtExecute_(OvsForwardingContext *fwdCtx, /* Delete entry in reverse direction if 'force' is specified */ if (entry && force && ctx.reply) { - OvsCtEntryDelete(entry); + OvsCtEntryDelete(entry, TRUE); entry = NULL; } @@ -757,6 +801,9 @@ OvsCtExecute_(OvsForwardingContext *fwdCtx, &entryCreated); } + if (entry == NULL) { + return status; + } /* * Note that natInfo is not the same as entry->natInfo here. natInfo * is decided by action in the openflow rule, entry->natInfo is decided @@ -764,23 +811,15 @@ OvsCtExecute_(OvsForwardingContext *fwdCtx, * NAT_ACTION_REVERSE, yet entry->natInfo is NAT_ACTION_SRC or * NAT_ACTION_DST without NAT_ACTION_REVERSE */ - if (entry && natInfo->natAction != NAT_ACTION_NONE) + if (natInfo->natAction != NAT_ACTION_NONE) { OvsNatPacket(fwdCtx, entry, entry->natInfo.natAction, key, ctx.reply); } - if (entry && mark) { - OvsConntrackSetMark(key, entry, mark->value, mark->mask, - &triggerUpdateEvent); - } - - if (entry && labels) { - OvsConntrackSetLabels(key, entry, &labels->value, &labels->mask, - &triggerUpdateEvent); - } + OvsCtSetMarkLabel(key, entry, mark, labels, &triggerUpdateEvent); - if (entry && OvsDetectFtpPacket(key)) { + if (OvsDetectFtpPacket(key)) { /* FTP parser will always be loaded */ UNREFERENCED_PARAMETER(helper); @@ -792,33 +831,19 @@ OvsCtExecute_(OvsForwardingContext *fwdCtx, } /* Add original tuple information to flow Key */ - if (entry && entry->key.dl_type == ntohs(ETH_TYPE_IPV4)) { - OVS_CT_KEY *ctKey; + if (entry->key.dl_type == ntohs(ETH_TYPE_IPV4)) { if (entry->parent != NULL) { POVS_CT_ENTRY parent = entry->parent; - ctKey = &parent->key; + OvsCtUpdateTuple(key, &parent->key); } else { - ctKey = &entry->key; + OvsCtUpdateTuple(key, &entry->key); } - - key->ct.tuple_ipv4.ipv4_src = ctKey->src.addr.ipv4_aligned; - key->ct.tuple_ipv4.ipv4_dst = ctKey->dst.addr.ipv4_aligned; - key->ct.tuple_ipv4.ipv4_proto = ctKey->nw_proto; - - /* Orig tuple Port is overloaded to take in ICMP-Type & Code */ - /* This mimics the behavior in lib/conntrack.c*/ - key->ct.tuple_ipv4.src_port = ctKey->nw_proto != IPPROTO_ICMP ? - ctKey->src.port : - htons(ctKey->src.icmp_type); - key->ct.tuple_ipv4.dst_port = ctKey->nw_proto != IPPROTO_ICMP ? - ctKey->dst.port : - htons(ctKey->src.icmp_code); } - if (entryCreated && entry) { + if (entryCreated) { OvsPostCtEventEntry(entry, OVS_EVENT_CT_NEW); } - if (postUpdateEvent && entry && !entryCreated && triggerUpdateEvent) { + if (postUpdateEvent && !entryCreated && triggerUpdateEvent) { OvsPostCtEventEntry(entry, OVS_EVENT_CT_UPDATE); } @@ -1003,9 +1028,7 @@ OvsConntrackEntryCleaner(PVOID data) for (int i = 0; i < CT_HASH_TABLE_SIZE; i++) { LIST_FORALL_SAFE(&ovsConntrackTable[i], link, next) { entry = CONTAINING_RECORD(link, OVS_CT_ENTRY, link); - if (entry && OvsCtEntryExpired(entry)) { - OvsCtEntryDelete(entry); - } + OvsCtEntryDelete(entry, FALSE); } } } @@ -1033,7 +1056,7 @@ OvsCtFlush(UINT16 zone, struct ovs_key_ct_tuple_ipv4 *tuple) NdisAcquireRWLockWrite(ovsConntrackLockObj, &lockState, 0); if (ctTotalEntries) { - for (int i = 0; i < CT_HASH_TABLE_SIZE; i++) { + for (UINT32 i = 0; i < CT_HASH_TABLE_SIZE; i++) { LIST_FORALL_SAFE(&ovsConntrackTable[i], link, next) { entry = CONTAINING_RECORD(link, OVS_CT_ENTRY, link); if (tuple) { @@ -1044,17 +1067,17 @@ OvsCtFlush(UINT16 zone, struct ovs_key_ct_tuple_ipv4 *tuple) tuple->src_port == entry->key.src.port && tuple->dst_port == entry->key.dst.port && (zone ? entry->key.zone == zone: TRUE)) { - OvsCtEntryDelete(entry); + OvsCtEntryDelete(entry, TRUE); } else if (tuple->ipv4_src == entry->key.src.addr.ipv4_aligned && tuple->ipv4_dst == entry->key.dst.addr.ipv4_aligned && tuple->ipv4_proto == entry->key.nw_proto && tuple->src_port == entry->key.src.icmp_type && tuple->dst_port == entry->key.src.icmp_code && (zone ? entry->key.zone == zone: TRUE)) { - OvsCtEntryDelete(entry); + OvsCtEntryDelete(entry, TRUE); } } else if (!zone || zone == entry->key.zone) { - OvsCtEntryDelete(entry); + OvsCtEntryDelete(entry, TRUE); } } } @@ -1434,6 +1457,7 @@ OvsCreateNlMsgFromCtEntry(POVS_CT_ENTRY entry, UINT16 nlmsgFlags = NLM_F_CREATE; NdisGetCurrentSystemTime((LARGE_INTEGER *)¤tTime); UINT8 nfgenFamily = 0; + if (entry->key.dl_type == htons(ETH_TYPE_IPV4)) { nfgenFamily = AF_INET; } else if (entry->key.dl_type == htons(ETH_TYPE_IPV6)) { diff --git a/datapath-windows/ovsext/Conntrack.h b/datapath-windows/ovsext/Conntrack.h index ef1fe21..35075db 100644 --- a/datapath-windows/ovsext/Conntrack.h +++ b/datapath-windows/ovsext/Conntrack.h @@ -228,8 +228,4 @@ NDIS_STATUS OvsCtHandleFtp(PNET_BUFFER_LIST curNbl, BOOLEAN request); UINT32 OvsHashCtKey(const OVS_CT_KEY *key); -BOOLEAN OvsCtKeyAreSame(OVS_CT_KEY ctxKey, OVS_CT_KEY entryKey); -POVS_CT_ENTRY OvsCtLookup(OvsConntrackKeyLookupCtx *ctx); - - #endif /* __OVS_CONNTRACK_H_ */ From patchwork Mon Jan 29 18:28:00 2018 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Anand Kumar X-Patchwork-Id: 867274 Return-Path: X-Original-To: incoming@patchwork.ozlabs.org Delivered-To: patchwork-incoming@bilbo.ozlabs.org Authentication-Results: ozlabs.org; spf=pass (mailfrom) smtp.mailfrom=openvswitch.org (client-ip=140.211.169.12; helo=mail.linuxfoundation.org; envelope-from=ovs-dev-bounces@openvswitch.org; receiver=) Received: from mail.linuxfoundation.org (mail.linuxfoundation.org [140.211.169.12]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by ozlabs.org (Postfix) with ESMTPS id 3zVggh4Z3lz9sNc for ; Tue, 30 Jan 2018 07:13:36 +1100 (AEDT) Received: from mail.linux-foundation.org (localhost [127.0.0.1]) by mail.linuxfoundation.org (Postfix) with ESMTP id 8F2754379; Mon, 29 Jan 2018 18:29:10 +0000 (UTC) X-Original-To: dev@openvswitch.org Delivered-To: ovs-dev@mail.linuxfoundation.org Received: from smtp1.linuxfoundation.org (smtp1.linux-foundation.org [172.17.192.35]) by mail.linuxfoundation.org (Postfix) with ESMTPS id 3D5654372 for ; Mon, 29 Jan 2018 18:29:07 +0000 (UTC) X-Greylist: domain auto-whitelisted by SQLgrey-1.7.6 Received: from EX13-EDG-OU-001.vmware.com (ex13-edg-ou-001.vmware.com [208.91.0.189]) by smtp1.linuxfoundation.org (Postfix) with ESMTPS id BA47A149 for ; Mon, 29 Jan 2018 18:29:06 +0000 (UTC) Received: from sc9-mailhost2.vmware.com (10.113.161.72) by EX13-EDG-OU-001.vmware.com (10.113.208.155) with Microsoft SMTP Server id 15.0.1156.6; Mon, 29 Jan 2018 10:29:03 -0800 Received: from localhost.localdomain (win-anand1.prom.eng.vmware.com [10.33.78.106]) by sc9-mailhost2.vmware.com (Postfix) with ESMTP id 2AD35B03C9; Mon, 29 Jan 2018 10:29:05 -0800 (PST) From: Anand Kumar To: Date: Mon, 29 Jan 2018 10:28:00 -0800 Message-ID: <20180129182801.3360-3-kumaranand@vmware.com> X-Mailer: git-send-email 2.9.3.windows.1 In-Reply-To: <20180129182801.3360-1-kumaranand@vmware.com> References: <20180129182801.3360-1-kumaranand@vmware.com> MIME-Version: 1.0 Received-SPF: None (EX13-EDG-OU-001.vmware.com: kumaranand@vmware.com does not designate permitted sender hosts) X-Spam-Status: No, score=-4.2 required=5.0 tests=BAYES_00, RCVD_IN_DNSWL_MED, T_RP_MATCHES_RCVD autolearn=ham version=3.3.1 X-Spam-Checker-Version: SpamAssassin 3.3.1 (2010-03-16) on smtp1.linux-foundation.org Subject: [ovs-dev] [PATCH v4 2/3] datapath-windows: Add a global level RW lock for NAT X-BeenThere: ovs-dev@openvswitch.org X-Mailman-Version: 2.1.12 Precedence: list List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Sender: ovs-dev-bounces@openvswitch.org Errors-To: ovs-dev-bounces@openvswitch.org Currently NAT module relies on the existing conntrack lock. This patch provides a basic lock implementation for NAT module in conntrack. Signed-off-by: Anand Kumar Acked-by: Alin Gabriel Serdean --- datapath-windows/ovsext/Conntrack.c | 36 ++++++++++++++++++++++++++++++++++-- 1 file changed, 34 insertions(+), 2 deletions(-) diff --git a/datapath-windows/ovsext/Conntrack.c b/datapath-windows/ovsext/Conntrack.c index 3cde836..7d56a50 100644 --- a/datapath-windows/ovsext/Conntrack.c +++ b/datapath-windows/ovsext/Conntrack.c @@ -32,6 +32,7 @@ KSTART_ROUTINE OvsConntrackEntryCleaner; static PLIST_ENTRY ovsConntrackTable; static OVS_CT_THREAD_CTX ctThreadCtx; static PNDIS_RW_LOCK_EX ovsConntrackLockObj; +static PNDIS_RW_LOCK_EX ovsCtNatLockObj; extern POVS_SWITCH_CONTEXT gOvsSwitchContext; static LONG ctTotalEntries; @@ -58,6 +59,13 @@ OvsInitConntrack(POVS_SWITCH_CONTEXT context) return STATUS_INSUFFICIENT_RESOURCES; } + ovsCtNatLockObj = NdisAllocateRWLock(context->NdisFilterHandle); + if (ovsCtNatLockObj == NULL) { + NdisFreeRWLock(ovsConntrackLockObj); + ovsConntrackLockObj = NULL; + return STATUS_INSUFFICIENT_RESOURCES; + } + /* Init the Hash Buffer */ ovsConntrackTable = OvsAllocateMemoryWithTag(sizeof(LIST_ENTRY) * CT_HASH_TABLE_SIZE, @@ -65,6 +73,8 @@ OvsInitConntrack(POVS_SWITCH_CONTEXT context) if (ovsConntrackTable == NULL) { NdisFreeRWLock(ovsConntrackLockObj); ovsConntrackLockObj = NULL; + NdisFreeRWLock(ovsCtNatLockObj); + ovsCtNatLockObj = NULL; return STATUS_INSUFFICIENT_RESOURCES; } @@ -82,6 +92,9 @@ OvsInitConntrack(POVS_SWITCH_CONTEXT context) NdisFreeRWLock(ovsConntrackLockObj); ovsConntrackLockObj = NULL; + NdisFreeRWLock(ovsCtNatLockObj); + ovsCtNatLockObj = NULL; + OvsFreeMemoryWithTag(ovsConntrackTable, OVS_CT_POOL_TAG); ovsConntrackTable = NULL; @@ -111,7 +124,7 @@ OvsInitConntrack(POVS_SWITCH_CONTEXT context) VOID OvsCleanupConntrack(VOID) { - LOCK_STATE_EX lockState; + LOCK_STATE_EX lockState, lockStateNat; NdisAcquireRWLockWrite(ovsConntrackLockObj, &lockState, 0); ctThreadCtx.exit = 1; KeSetEvent(&ctThreadCtx.event, 0, FALSE); @@ -131,7 +144,11 @@ OvsCleanupConntrack(VOID) NdisFreeRWLock(ovsConntrackLockObj); ovsConntrackLockObj = NULL; + NdisAcquireRWLockWrite(ovsCtNatLockObj, &lockStateNat, 0); OvsNatCleanup(); + NdisReleaseRWLock(ovsCtNatLockObj, &lockStateNat); + NdisFreeRWLock(ovsCtNatLockObj); + ovsCtNatLockObj = NULL; } static __inline VOID @@ -197,15 +214,19 @@ OvsCtAddEntry(POVS_CT_ENTRY entry, OvsConntrackKeyLookupCtx *ctx, if (natInfo == NULL) { entry->natInfo.natAction = NAT_ACTION_NONE; } else { + LOCK_STATE_EX lockStateNat; + NdisAcquireRWLockWrite(ovsCtNatLockObj, &lockStateNat, 0); if (OvsIsForwardNat(natInfo->natAction)) { entry->natInfo = *natInfo; if (!OvsNatTranslateCtEntry(entry)) { + NdisReleaseRWLock(ovsCtNatLockObj, &lockStateNat); return FALSE; } ctx->hash = OvsHashCtKey(&entry->key); } else { entry->natInfo.natAction = natInfo->natAction; } + NdisReleaseRWLock(ovsCtNatLockObj, &lockStateNat); } entry->timestampStart = now; @@ -358,7 +379,10 @@ OvsCtEntryDelete(POVS_CT_ENTRY entry, BOOLEAN forceDelete) } if (forceDelete || OvsCtEntryExpired(entry)) { if (entry->natInfo.natAction) { + LOCK_STATE_EX lockStateNat; + NdisAcquireRWLockWrite(ovsCtNatLockObj, &lockStateNat, 0); OvsNatDeleteKey(&entry->key); + NdisReleaseRWLock(ovsCtNatLockObj, &lockStateNat); } OvsPostCtEventEntry(entry, OVS_EVENT_CT_DELETE); RemoveEntryList(&entry->link); @@ -560,7 +584,10 @@ OvsCtSetupLookupCtx(OvsFlowKey *flowKey, return NDIS_STATUS_INVALID_PACKET; } + LOCK_STATE_EX lockStateNat; + NdisAcquireRWLockRead(ovsCtNatLockObj, &lockStateNat, 0); natEntry = OvsNatLookup(&ctx->key, TRUE); + NdisReleaseRWLock(ovsCtNatLockObj, &lockStateNat); if (natEntry) { /* Translate address first for reverse NAT */ ctx->key = natEntry->ctEntry->key; @@ -813,8 +840,11 @@ OvsCtExecute_(OvsForwardingContext *fwdCtx, */ if (natInfo->natAction != NAT_ACTION_NONE) { + LOCK_STATE_EX lockStateNat; + NdisAcquireRWLockWrite(ovsCtNatLockObj, &lockStateNat, 0); OvsNatPacket(fwdCtx, entry, entry->natInfo.natAction, key, ctx.reply); + NdisReleaseRWLock(ovsCtNatLockObj, &lockStateNat); } OvsCtSetMarkLabel(key, entry, mark, labels, &triggerUpdateEvent); @@ -1052,7 +1082,7 @@ OvsCtFlush(UINT16 zone, struct ovs_key_ct_tuple_ipv4 *tuple) PLIST_ENTRY link, next; POVS_CT_ENTRY entry; - LOCK_STATE_EX lockState; + LOCK_STATE_EX lockState, lockStateNat; NdisAcquireRWLockWrite(ovsConntrackLockObj, &lockState, 0); if (ctTotalEntries) { @@ -1083,7 +1113,9 @@ OvsCtFlush(UINT16 zone, struct ovs_key_ct_tuple_ipv4 *tuple) } } + NdisAcquireRWLockWrite(ovsCtNatLockObj, &lockStateNat, 0); OvsNatFlush(zone); + NdisReleaseRWLock(ovsCtNatLockObj, &lockStateNat); NdisReleaseRWLock(ovsConntrackLockObj, &lockState); return NDIS_STATUS_SUCCESS; } From patchwork Mon Jan 29 18:28:01 2018 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Anand Kumar X-Patchwork-Id: 867302 Return-Path: X-Original-To: incoming@patchwork.ozlabs.org Delivered-To: patchwork-incoming@bilbo.ozlabs.org Authentication-Results: ozlabs.org; spf=pass (mailfrom) smtp.mailfrom=openvswitch.org (client-ip=140.211.169.12; helo=mail.linuxfoundation.org; envelope-from=ovs-dev-bounces@openvswitch.org; receiver=) Received: from mail.linuxfoundation.org (mail.linuxfoundation.org [140.211.169.12]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by ozlabs.org (Postfix) with ESMTPS id 3zVhnd2Ysmz9rxj for ; Tue, 30 Jan 2018 08:03:49 +1100 (AEDT) Received: from mail.linux-foundation.org (localhost [127.0.0.1]) by mail.linuxfoundation.org (Postfix) with ESMTP id BDB9210A4; Mon, 29 Jan 2018 20:12:55 +0000 (UTC) X-Original-To: dev@openvswitch.org Delivered-To: ovs-dev@mail.linuxfoundation.org Received: from smtp1.linuxfoundation.org (smtp1.linux-foundation.org [172.17.192.35]) by mail.linuxfoundation.org (Postfix) with ESMTPS id B48AF4373 for ; Mon, 29 Jan 2018 18:29:08 +0000 (UTC) X-Greylist: domain auto-whitelisted by SQLgrey-1.7.6 Received: from EX13-EDG-OU-001.vmware.com (ex13-edg-ou-001.vmware.com [208.91.0.189]) by smtp1.linuxfoundation.org (Postfix) with ESMTPS id 459F8149 for ; Mon, 29 Jan 2018 18:29:07 +0000 (UTC) Received: from sc9-mailhost2.vmware.com (10.113.161.72) by EX13-EDG-OU-001.vmware.com (10.113.208.155) with Microsoft SMTP Server id 15.0.1156.6; Mon, 29 Jan 2018 10:29:03 -0800 Received: from localhost.localdomain (win-anand1.prom.eng.vmware.com [10.33.78.106]) by sc9-mailhost2.vmware.com (Postfix) with ESMTP id 47CD3B036E; Mon, 29 Jan 2018 10:29:05 -0800 (PST) From: Anand Kumar To: Date: Mon, 29 Jan 2018 10:28:01 -0800 Message-ID: <20180129182801.3360-4-kumaranand@vmware.com> X-Mailer: git-send-email 2.9.3.windows.1 In-Reply-To: <20180129182801.3360-1-kumaranand@vmware.com> References: <20180129182801.3360-1-kumaranand@vmware.com> MIME-Version: 1.0 Received-SPF: None (EX13-EDG-OU-001.vmware.com: kumaranand@vmware.com does not designate permitted sender hosts) X-Spam-Status: No, score=-4.2 required=5.0 tests=BAYES_00, RCVD_IN_DNSWL_MED, T_RP_MATCHES_RCVD autolearn=ham version=3.3.1 X-Spam-Checker-Version: SpamAssassin 3.3.1 (2010-03-16) on smtp1.linux-foundation.org Subject: [ovs-dev] [PATCH v4 3/3] datapath-windows: Optimize conntrack lock implementation. X-BeenThere: ovs-dev@openvswitch.org X-Mailman-Version: 2.1.12 Precedence: list List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Sender: ovs-dev-bounces@openvswitch.org Errors-To: ovs-dev-bounces@openvswitch.org Currently, there is one global lock for conntrack module, which protects conntrack entries and conntrack table. All the NAT operations are performed holding this lock. This becomes inefficient, as the number of conntrack entries grow. With new implementation, we will have two PNDIS_RW_LOCK_EX locks in conntrack. 1. ovsCtBucketLock - one rw lock per bucket of the conntrack table, which is shared by all the ct entries that belong to the same bucket. 2. lock - a rw lock in OVS_CT_ENTRY structure that protects the members of conntrack entry. Also, OVS_CT_ENTRY structure will have a lock reference(bucketLockRef) to the corresponding OvsCtBucketLock of conntrack table. We need this reference to retrieve ovsCtBucketLock from ct entry for delete operation. Signed-off-by: Anand Kumar Acked-by: Alin Gabriel Serdean Signed-off-by: Anand Kumar Acked-by: Alin Gabriel Serdean Signed-off-by: Anand Kumar --- v1->v2: Address potential memory leak in conntrack initialization. v2->v3: Fix invalid memory access after deleting ct entry. v3->v4: Address warning "uninitialized memory" --- datapath-windows/ovsext/Conntrack-nat.c | 6 + datapath-windows/ovsext/Conntrack.c | 233 ++++++++++++++++++++------------ datapath-windows/ovsext/Conntrack.h | 3 + 3 files changed, 157 insertions(+), 85 deletions(-) diff --git a/datapath-windows/ovsext/Conntrack-nat.c b/datapath-windows/ovsext/Conntrack-nat.c index 7975770..316c946 100644 --- a/datapath-windows/ovsext/Conntrack-nat.c +++ b/datapath-windows/ovsext/Conntrack-nat.c @@ -167,12 +167,16 @@ OvsNatPacket(OvsForwardingContext *ovsFwdCtx, { UINT32 natFlag; const struct ct_endpoint* endpoint; + LOCK_STATE_EX lockState; + /* XXX: Move conntrack locks out of NAT after implementing lock in NAT. */ + NdisAcquireRWLockRead(entry->lock, &lockState, 0); /* When it is NAT, only entry->rev_key contains NATTED address; When it is unNAT, only entry->key contains the UNNATTED address;*/ const OVS_CT_KEY *ctKey = reverse ? &entry->key : &entry->rev_key; BOOLEAN isSrcNat; if (!(natAction & (NAT_ACTION_SRC | NAT_ACTION_DST))) { + NdisReleaseRWLock(entry->lock, &lockState); return; } isSrcNat = (((natAction & NAT_ACTION_SRC) && !reverse) || @@ -202,6 +206,7 @@ OvsNatPacket(OvsForwardingContext *ovsFwdCtx, } } else if (ctKey->dl_type == htons(ETH_TYPE_IPV6)){ // XXX: IPv6 packet not supported yet. + NdisReleaseRWLock(entry->lock, &lockState); return; } if (natAction & (NAT_ACTION_SRC_PORT | NAT_ACTION_DST_PORT)) { @@ -215,6 +220,7 @@ OvsNatPacket(OvsForwardingContext *ovsFwdCtx, } } } + NdisReleaseRWLock(entry->lock, &lockState); } diff --git a/datapath-windows/ovsext/Conntrack.c b/datapath-windows/ovsext/Conntrack.c index 7d56a50..c90c000 100644 --- a/datapath-windows/ovsext/Conntrack.c +++ b/datapath-windows/ovsext/Conntrack.c @@ -31,7 +31,7 @@ KSTART_ROUTINE OvsConntrackEntryCleaner; static PLIST_ENTRY ovsConntrackTable; static OVS_CT_THREAD_CTX ctThreadCtx; -static PNDIS_RW_LOCK_EX ovsConntrackLockObj; +static PNDIS_RW_LOCK_EX *ovsCtBucketLock = NULL; static PNDIS_RW_LOCK_EX ovsCtNatLockObj; extern POVS_SWITCH_CONTEXT gOvsSwitchContext; static LONG ctTotalEntries; @@ -49,20 +49,14 @@ MapNlToCtTuple(POVS_MESSAGE msgIn, PNL_ATTR attr, NTSTATUS OvsInitConntrack(POVS_SWITCH_CONTEXT context) { - NTSTATUS status; + NTSTATUS status = STATUS_SUCCESS; HANDLE threadHandle = NULL; ctTotalEntries = 0; + UINT32 numBucketLocks = CT_HASH_TABLE_SIZE; /* Init the sync-lock */ - ovsConntrackLockObj = NdisAllocateRWLock(context->NdisFilterHandle); - if (ovsConntrackLockObj == NULL) { - return STATUS_INSUFFICIENT_RESOURCES; - } - ovsCtNatLockObj = NdisAllocateRWLock(context->NdisFilterHandle); if (ovsCtNatLockObj == NULL) { - NdisFreeRWLock(ovsConntrackLockObj); - ovsConntrackLockObj = NULL; return STATUS_INSUFFICIENT_RESOURCES; } @@ -71,15 +65,27 @@ OvsInitConntrack(POVS_SWITCH_CONTEXT context) * CT_HASH_TABLE_SIZE, OVS_CT_POOL_TAG); if (ovsConntrackTable == NULL) { - NdisFreeRWLock(ovsConntrackLockObj); - ovsConntrackLockObj = NULL; NdisFreeRWLock(ovsCtNatLockObj); ovsCtNatLockObj = NULL; return STATUS_INSUFFICIENT_RESOURCES; } - for (int i = 0; i < CT_HASH_TABLE_SIZE; i++) { + ovsCtBucketLock = OvsAllocateMemoryWithTag(sizeof(PNDIS_RW_LOCK_EX) + * CT_HASH_TABLE_SIZE, + OVS_CT_POOL_TAG); + if (ovsCtBucketLock == NULL) { + status = STATUS_INSUFFICIENT_RESOURCES; + goto freeTable; + } + + for (UINT32 i = 0; i < CT_HASH_TABLE_SIZE; i++) { InitializeListHead(&ovsConntrackTable[i]); + ovsCtBucketLock[i] = NdisAllocateRWLock(context->NdisFilterHandle); + if (ovsCtBucketLock[i] == NULL) { + status = STATUS_INSUFFICIENT_RESOURCES; + numBucketLocks = i; + goto freeBucketLock; + } } /* Init CT Cleaner Thread */ @@ -89,16 +95,7 @@ OvsInitConntrack(POVS_SWITCH_CONTEXT context) &ctThreadCtx); if (status != STATUS_SUCCESS) { - NdisFreeRWLock(ovsConntrackLockObj); - ovsConntrackLockObj = NULL; - - NdisFreeRWLock(ovsCtNatLockObj); - ovsCtNatLockObj = NULL; - - OvsFreeMemoryWithTag(ovsConntrackTable, OVS_CT_POOL_TAG); - ovsConntrackTable = NULL; - - return status; + goto freeBucketLock; } ObReferenceObjectByHandle(threadHandle, SYNCHRONIZE, NULL, KernelMode, @@ -110,9 +107,23 @@ OvsInitConntrack(POVS_SWITCH_CONTEXT context) if (status != STATUS_SUCCESS) { OvsCleanupConntrack(); - return status; } return STATUS_SUCCESS; + +freeBucketLock: + for (UINT32 i = 0; i < numBucketLocks; i++) { + if (ovsCtBucketLock[i] != NULL) { + NdisFreeRWLock(ovsCtBucketLock[i]); + } + } + OvsFreeMemoryWithTag(ovsCtBucketLock, OVS_CT_POOL_TAG); + ovsCtBucketLock = NULL; +freeTable: + OvsFreeMemoryWithTag(ovsConntrackTable, OVS_CT_POOL_TAG); + ovsConntrackTable = NULL; + NdisFreeRWLock(ovsCtNatLockObj); + ovsCtNatLockObj = NULL; + return status; } /* @@ -124,12 +135,9 @@ OvsInitConntrack(POVS_SWITCH_CONTEXT context) VOID OvsCleanupConntrack(VOID) { - LOCK_STATE_EX lockState, lockStateNat; - NdisAcquireRWLockWrite(ovsConntrackLockObj, &lockState, 0); + LOCK_STATE_EX lockStateNat; ctThreadCtx.exit = 1; KeSetEvent(&ctThreadCtx.event, 0, FALSE); - NdisReleaseRWLock(ovsConntrackLockObj, &lockState); - KeWaitForSingleObject(ctThreadCtx.threadObject, Executive, KernelMode, FALSE, NULL); ObDereferenceObject(ctThreadCtx.threadObject); @@ -142,8 +150,14 @@ OvsCleanupConntrack(VOID) ovsConntrackTable = NULL; } - NdisFreeRWLock(ovsConntrackLockObj); - ovsConntrackLockObj = NULL; + for (UINT32 i = 0; i < CT_HASH_TABLE_SIZE; i++) { + if (ovsCtBucketLock[i] != NULL) { + NdisFreeRWLock(ovsCtBucketLock[i]); + } + } + OvsFreeMemoryWithTag(ovsCtBucketLock, OVS_CT_POOL_TAG); + ovsCtBucketLock = NULL; + NdisAcquireRWLockWrite(ovsCtNatLockObj, &lockStateNat, 0); OvsNatCleanup(); NdisReleaseRWLock(ovsCtNatLockObj, &lockStateNat); @@ -179,11 +193,20 @@ OvsCtUpdateFlowKey(struct OvsFlowKey *key, } } +/* + *---------------------------------------------------------------------------- + * OvsPostCtEventEntry + * Assumes ct entry lock is acquired + * XXX Refactor OvsPostCtEvent() as it does not require ct entry lock. + *---------------------------------------------------------------------------- + */ static __inline VOID OvsPostCtEventEntry(POVS_CT_ENTRY entry, UINT8 type) { OVS_CT_EVENT_ENTRY ctEventEntry = {0}; NdisMoveMemory(&ctEventEntry.entry, entry, sizeof(OVS_CT_ENTRY)); + ctEventEntry.entry.lock = NULL; + ctEventEntry.entry.parent = NULL; ctEventEntry.type = type; OvsPostCtEvent(&ctEventEntry); } @@ -191,6 +214,8 @@ OvsPostCtEventEntry(POVS_CT_ENTRY entry, UINT8 type) static __inline VOID OvsCtIncrementCounters(POVS_CT_ENTRY entry, BOOLEAN reply, PNET_BUFFER_LIST nbl) { + LOCK_STATE_EX lockState; + NdisAcquireRWLockWrite(entry->lock, &lockState, 0); if (reply) { entry->rev_key.byteCount+= OvsPacketLenNBL(nbl); entry->rev_key.packetCount++; @@ -198,12 +223,15 @@ OvsCtIncrementCounters(POVS_CT_ENTRY entry, BOOLEAN reply, PNET_BUFFER_LIST nbl) entry->key.byteCount += OvsPacketLenNBL(nbl); entry->key.packetCount++; } + NdisReleaseRWLock(entry->lock, &lockState); } static __inline BOOLEAN -OvsCtAddEntry(POVS_CT_ENTRY entry, OvsConntrackKeyLookupCtx *ctx, +OvsCtAddEntry(POVS_SWITCH_CONTEXT switchContext, POVS_CT_ENTRY entry, + OvsConntrackKeyLookupCtx *ctx, PNAT_ACTION_INFO natInfo, UINT64 now) { + LOCK_STATE_EX lockState; NdisMoveMemory(&entry->key, &ctx->key, sizeof(OVS_CT_KEY)); NdisMoveMemory(&entry->rev_key, &ctx->key, sizeof(OVS_CT_KEY)); OvsCtKeyReverse(&entry->rev_key); @@ -230,10 +258,19 @@ OvsCtAddEntry(POVS_CT_ENTRY entry, OvsConntrackKeyLookupCtx *ctx, } entry->timestampStart = now; - InsertHeadList(&ovsConntrackTable[ctx->hash & CT_HASH_TABLE_MASK], + entry->lock = NdisAllocateRWLock(switchContext->NdisFilterHandle); + if (entry->lock == NULL) { + OVS_LOG_ERROR("NdisAllocateRWLock failed : Insufficient resources"); + return FALSE; + } + UINT32 bucketIdx = ctx->hash & CT_HASH_TABLE_MASK; + entry->bucketLockRef = ovsCtBucketLock[bucketIdx]; + NdisAcquireRWLockWrite(ovsCtBucketLock[bucketIdx], &lockState, 0); + InsertHeadList(&ovsConntrackTable[bucketIdx], &entry->link); InterlockedIncrement((LONG volatile *)&ctTotalEntries); + NdisReleaseRWLock(ovsCtBucketLock[bucketIdx], &lockState); return TRUE; } @@ -255,7 +292,6 @@ OvsCtEntryCreate(OvsForwardingContext *fwdCtx, *entryCreated = FALSE; state |= OVS_CS_F_NEW; - switch (ipProto) { case IPPROTO_TCP: { @@ -303,11 +339,11 @@ OvsCtEntryCreate(OvsForwardingContext *fwdCtx, if (parentEntry != NULL && state != OVS_CS_F_INVALID) { state |= OVS_CS_F_RELATED; } - if (state != OVS_CS_F_INVALID && commit) { if (entry) { entry->parent = parentEntry; - if (OvsCtAddEntry(entry, ctx, natInfo, currentTime)) { + if (OvsCtAddEntry(fwdCtx->switchContext, entry, ctx, + natInfo, currentTime)) { *entryCreated = TRUE; } else { /* Unable to add entry to the list */ @@ -337,6 +373,8 @@ OvsCtUpdateEntry(OVS_CT_ENTRY* entry, UINT64 now) { CT_UPDATE_RES status; + LOCK_STATE_EX lockState; + NdisAcquireRWLockWrite(entry->lock, &lockState, 0); switch (ipProto) { case IPPROTO_TCP: { @@ -344,25 +382,32 @@ OvsCtUpdateEntry(OVS_CT_ENTRY* entry, const TCPHdr *tcp; tcp = OvsGetTcp(nbl, l4Offset, &tcpStorage); if (!tcp) { - status = CT_UPDATE_INVALID; + status = CT_UPDATE_INVALID; break; } - status = OvsConntrackUpdateTcpEntry(entry, tcp, nbl, reply, now); + status = OvsConntrackUpdateTcpEntry(entry, tcp, nbl, reply, now); break; } case IPPROTO_ICMP: - status = OvsConntrackUpdateIcmpEntry(entry, reply, now); + status = OvsConntrackUpdateIcmpEntry(entry, reply, now); break; case IPPROTO_UDP: - status = OvsConntrackUpdateOtherEntry(entry, reply, now); + status = OvsConntrackUpdateOtherEntry(entry, reply, now); break; default: - status = CT_UPDATE_INVALID; + status = CT_UPDATE_INVALID; break; } + NdisReleaseRWLock(entry->lock, &lockState); return status; } +/* + *---------------------------------------------------------------------------- + * OvsCtEntryExpired + * Assumes ct entry lock is acquired + *---------------------------------------------------------------------------- + */ static __inline BOOLEAN OvsCtEntryExpired(POVS_CT_ENTRY entry) { @@ -377,6 +422,8 @@ OvsCtEntryDelete(POVS_CT_ENTRY entry, BOOLEAN forceDelete) if (entry == NULL) { return; } + LOCK_STATE_EX lockState; + NdisAcquireRWLockWrite(entry->lock, &lockState, 0); if (forceDelete || OvsCtEntryExpired(entry)) { if (entry->natInfo.natAction) { LOCK_STATE_EX lockStateNat; @@ -386,10 +433,13 @@ OvsCtEntryDelete(POVS_CT_ENTRY entry, BOOLEAN forceDelete) } OvsPostCtEventEntry(entry, OVS_EVENT_CT_DELETE); RemoveEntryList(&entry->link); + NdisReleaseRWLock(entry->lock, &lockState); + NdisFreeRWLock(entry->lock); OvsFreeMemoryWithTag(entry, OVS_CT_POOL_TAG); InterlockedDecrement((LONG volatile*)&ctTotalEntries); return; } + NdisReleaseRWLock(entry->lock, &lockState); } static __inline NDIS_STATUS @@ -440,7 +490,8 @@ OvsCtLookup(OvsConntrackKeyLookupCtx *ctx) POVS_CT_ENTRY entry; BOOLEAN reply = FALSE; POVS_CT_ENTRY found = NULL; - + LOCK_STATE_EX lockState, lockStateTable; + UINT32 bucketIdx; /* Reverse NAT must be performed before OvsCtLookup, so here * we simply need to flip the src and dst in key and compare * they are equal. Note that flipped key is not equal to @@ -452,10 +503,11 @@ OvsCtLookup(OvsConntrackKeyLookupCtx *ctx) if (!ctTotalEntries) { return found; } - - LIST_FORALL(&ovsConntrackTable[ctx->hash & CT_HASH_TABLE_MASK], link) { + bucketIdx = ctx->hash & CT_HASH_TABLE_MASK; + NdisAcquireRWLockRead(ovsCtBucketLock[bucketIdx], &lockStateTable, 0); + LIST_FORALL(&ovsConntrackTable[bucketIdx], link) { entry = CONTAINING_RECORD(link, OVS_CT_ENTRY, link); - + NdisAcquireRWLockRead(entry->lock, &lockState, 0); if (OvsCtKeyAreSame(ctx->key, entry->key)) { found = entry; reply = FALSE; @@ -472,10 +524,13 @@ OvsCtLookup(OvsConntrackKeyLookupCtx *ctx) } else { ctx->reply = reply; } + NdisReleaseRWLock(entry->lock, &lockState); break; } + NdisReleaseRWLock(entry->lock, &lockState); } + NdisReleaseRWLock(ovsCtBucketLock[bucketIdx], &lockStateTable); ctx->entry = found; return found; } @@ -625,6 +680,8 @@ OvsProcessConntrackEntry(OvsForwardingContext *fwdCtx, POVS_CT_ENTRY entry = ctx->entry; UINT32 state = 0; PNET_BUFFER_LIST curNbl = fwdCtx->curNbl; + LOCK_STATE_EX lockState, lockStateTable; + PNDIS_RW_LOCK_EX bucketLockRef = NULL; *entryCreated = FALSE; /* If an entry was found, update the state based on TCP flags */ @@ -649,7 +706,10 @@ OvsProcessConntrackEntry(OvsForwardingContext *fwdCtx, break; case CT_UPDATE_NEW: //Delete and update the Conntrack + bucketLockRef = entry->bucketLockRef; + NdisAcquireRWLockWrite(bucketLockRef, &lockStateTable, 0); OvsCtEntryDelete(ctx->entry, TRUE); + NdisReleaseRWLock(bucketLockRef, &lockStateTable); ctx->entry = NULL; entry = OvsCtEntryCreate(fwdCtx, key->ipKey.nwProto, l4Offset, ctx, key, natInfo, commit, currentTime, @@ -660,25 +720,26 @@ OvsProcessConntrackEntry(OvsForwardingContext *fwdCtx, break; } } - - if (key->ipKey.nwProto == IPPROTO_TCP && entry) { - /* Update the related bit if there is a parent */ - if (entry->parent) { - state |= OVS_CS_F_RELATED; - } else { - POVS_CT_ENTRY parentEntry; - parentEntry = OvsCtRelatedLookup(ctx->key, currentTime); - entry->parent = parentEntry; - if (parentEntry != NULL) { + if (entry) { + NdisAcquireRWLockRead(entry->lock, &lockState, 0); + if (key->ipKey.nwProto == IPPROTO_TCP) { + /* Update the related bit if there is a parent */ + if (entry->parent) { state |= OVS_CS_F_RELATED; + } else { + POVS_CT_ENTRY parentEntry; + parentEntry = OvsCtRelatedLookup(ctx->key, currentTime); + entry->parent = parentEntry; + if (parentEntry != NULL) { + state |= OVS_CS_F_RELATED; + } } } - } - /* Copy mark and label from entry into flowKey. If actions specify - different mark and label, update the flowKey. */ - if (entry != NULL) { + /* Copy mark and label from entry into flowKey. If actions specify + different mark and label, update the flowKey. */ OvsCtUpdateFlowKey(key, state, zone, entry->mark, &entry->labels); + NdisReleaseRWLock(entry->lock, &lockState); } else { OvsCtUpdateFlowKey(key, state, zone, 0, NULL); } @@ -732,6 +793,8 @@ OvsCtSetMarkLabel(OvsFlowKey *key, MD_LABELS *labels, BOOLEAN *triggerUpdateEvent) { + LOCK_STATE_EX lockState; + NdisAcquireRWLockWrite(entry->lock, &lockState, 0); if (mark) { OvsConntrackSetMark(key, entry, mark->value, mark->mask, triggerUpdateEvent); @@ -741,8 +804,15 @@ OvsCtSetMarkLabel(OvsFlowKey *key, OvsConntrackSetLabels(key, entry, &labels->value, &labels->mask, triggerUpdateEvent); } + NdisReleaseRWLock(entry->lock, &lockState); } +/* + *---------------------------------------------------------------------------- + * OvsCtUpdateTuple + * Assumes ct entry lock is acquired + *---------------------------------------------------------------------------- + */ static __inline void OvsCtUpdateTuple(OvsFlowKey *key, OVS_CT_KEY *ctKey) { @@ -778,23 +848,23 @@ OvsCtExecute_(OvsForwardingContext *fwdCtx, POVS_CT_ENTRY entry = NULL; PNET_BUFFER_LIST curNbl = fwdCtx->curNbl; OvsConntrackKeyLookupCtx ctx = { 0 }; - LOCK_STATE_EX lockState; + LOCK_STATE_EX lockState, lockStateTable; UINT64 currentTime; NdisGetCurrentSystemTime((LARGE_INTEGER *) ¤tTime); - /* Retrieve the Conntrack Key related fields from packet */ OvsCtSetupLookupCtx(key, zone, &ctx, curNbl, layers->l4Offset); - NdisAcquireRWLockWrite(ovsConntrackLockObj, &lockState, 0); - /* Lookup Conntrack entries for a matching entry */ entry = OvsCtLookup(&ctx); BOOLEAN entryCreated = FALSE; /* Delete entry in reverse direction if 'force' is specified */ if (entry && force && ctx.reply) { + PNDIS_RW_LOCK_EX bucketLockRef = entry->bucketLockRef; + NdisAcquireRWLockWrite(bucketLockRef, &lockStateTable, 0); OvsCtEntryDelete(entry, TRUE); + NdisReleaseRWLock(bucketLockRef, &lockStateTable); entry = NULL; } @@ -803,7 +873,6 @@ OvsCtExecute_(OvsForwardingContext *fwdCtx, * This blocks only new entries from being created and doesn't * affect existing connections. */ - NdisReleaseRWLock(ovsConntrackLockObj, &lockState); OVS_LOG_ERROR("Conntrack Limit hit: %lu", ctTotalEntries); return NDIS_STATUS_RESOURCES; } @@ -831,6 +900,7 @@ OvsCtExecute_(OvsForwardingContext *fwdCtx, if (entry == NULL) { return status; } + /* * Note that natInfo is not the same as entry->natInfo here. natInfo * is decided by action in the openflow rule, entry->natInfo is decided @@ -859,12 +929,15 @@ OvsCtExecute_(OvsForwardingContext *fwdCtx, OVS_LOG_ERROR("Error while parsing the FTP packet"); } } - + NdisAcquireRWLockRead(entry->lock, &lockState, 0); /* Add original tuple information to flow Key */ if (entry->key.dl_type == ntohs(ETH_TYPE_IPV4)) { if (entry->parent != NULL) { POVS_CT_ENTRY parent = entry->parent; + LOCK_STATE_EX lockStateParent; + NdisAcquireRWLockRead(parent->lock, &lockStateParent, 0); OvsCtUpdateTuple(key, &parent->key); + NdisReleaseRWLock(parent->lock, &lockStateParent); } else { OvsCtUpdateTuple(key, &entry->key); } @@ -877,7 +950,7 @@ OvsCtExecute_(OvsForwardingContext *fwdCtx, OvsPostCtEventEntry(entry, OVS_EVENT_CT_UPDATE); } - NdisReleaseRWLock(ovsConntrackLockObj, &lockState); + NdisReleaseRWLock(entry->lock, &lockState); return status; } @@ -1041,13 +1114,7 @@ OvsConntrackEntryCleaner(PVOID data) BOOLEAN success = TRUE; while (success) { - if (ovsConntrackLockObj == NULL) { - /* Lock has been freed by 'OvsCleanupConntrack()' */ - break; - } - NdisAcquireRWLockWrite(ovsConntrackLockObj, &lockState, 0); if (context->exit) { - NdisReleaseRWLock(ovsConntrackLockObj, &lockState); break; } @@ -1055,14 +1122,15 @@ OvsConntrackEntryCleaner(PVOID data) INT64 threadSleepTimeout = -CT_CLEANUP_INTERVAL; if (ctTotalEntries) { - for (int i = 0; i < CT_HASH_TABLE_SIZE; i++) { + for (UINT32 i = 0; i < CT_HASH_TABLE_SIZE; i++) { + NdisAcquireRWLockWrite(ovsCtBucketLock[i], &lockState, 0); LIST_FORALL_SAFE(&ovsConntrackTable[i], link, next) { entry = CONTAINING_RECORD(link, OVS_CT_ENTRY, link); OvsCtEntryDelete(entry, FALSE); } + NdisReleaseRWLock(ovsCtBucketLock[i], &lockState); } } - NdisReleaseRWLock(ovsConntrackLockObj, &lockState); KeWaitForSingleObject(&context->event, Executive, KernelMode, FALSE, (LARGE_INTEGER *)&threadSleepTimeout); } @@ -1081,13 +1149,12 @@ OvsCtFlush(UINT16 zone, struct ovs_key_ct_tuple_ipv4 *tuple) { PLIST_ENTRY link, next; POVS_CT_ENTRY entry; - LOCK_STATE_EX lockState, lockStateNat; - NdisAcquireRWLockWrite(ovsConntrackLockObj, &lockState, 0); if (ctTotalEntries) { for (UINT32 i = 0; i < CT_HASH_TABLE_SIZE; i++) { LIST_FORALL_SAFE(&ovsConntrackTable[i], link, next) { + NdisAcquireRWLockWrite(ovsCtBucketLock[i], &lockState, 0); entry = CONTAINING_RECORD(link, OVS_CT_ENTRY, link); if (tuple) { if (tuple->ipv4_proto != IPPROTO_ICMP && @@ -1109,6 +1176,7 @@ OvsCtFlush(UINT16 zone, struct ovs_key_ct_tuple_ipv4 *tuple) } else if (!zone || zone == entry->key.zone) { OvsCtEntryDelete(entry, TRUE); } + NdisReleaseRWLock(ovsCtBucketLock[i], &lockState); } } } @@ -1116,7 +1184,6 @@ OvsCtFlush(UINT16 zone, struct ovs_key_ct_tuple_ipv4 *tuple) NdisAcquireRWLockWrite(ovsCtNatLockObj, &lockStateNat, 0); OvsNatFlush(zone); NdisReleaseRWLock(ovsCtNatLockObj, &lockStateNat); - NdisReleaseRWLock(ovsConntrackLockObj, &lockState); return NDIS_STATUS_SUCCESS; } @@ -1620,7 +1687,6 @@ OvsCreateNlMsgFromCtEntry(POVS_CT_ENTRY entry, nlMsg = (PNL_MSG_HDR)NlBufAt(&nlBuf, 0, 0); nlMsg->nlmsgLen = NlBufSize(&nlBuf); - return STATUS_SUCCESS; } @@ -1663,12 +1729,11 @@ OvsCtDumpCmdHandler(POVS_USER_PARAMS_CONTEXT usrParamsCtx, UINT32 i = CT_HASH_TABLE_SIZE; UINT32 outIndex = 0; - LOCK_STATE_EX lockState; - NdisAcquireRWLockRead(ovsConntrackLockObj, &lockState, 0); - + LOCK_STATE_EX lockState, lockStateTable; if (ctTotalEntries) { for (i = inBucket; i < CT_HASH_TABLE_SIZE; i++) { PLIST_ENTRY head, link; + NdisAcquireRWLockRead(ovsCtBucketLock[i], &lockStateTable, 0); head = &ovsConntrackTable[i]; POVS_CT_ENTRY entry = NULL; @@ -1681,7 +1746,7 @@ OvsCtDumpCmdHandler(POVS_USER_PARAMS_CONTEXT usrParamsCtx, */ if (outIndex >= inIndex) { entry = CONTAINING_RECORD(link, OVS_CT_ENTRY, link); - + NdisAcquireRWLockRead(entry->lock, &lockState, 0); rc = OvsCreateNlMsgFromCtEntry(entry, usrParamsCtx->outputBuffer, usrParamsCtx->outputLength, @@ -1690,9 +1755,9 @@ OvsCtDumpCmdHandler(POVS_USER_PARAMS_CONTEXT usrParamsCtx, msgIn->nlMsg.nlmsgPid, msgIn->nfGenMsg.version, 0); - + NdisReleaseRWLock(entry->lock, &lockState); if (rc != NDIS_STATUS_SUCCESS) { - NdisReleaseRWLock(ovsConntrackLockObj, &lockState); + NdisReleaseRWLock(ovsCtBucketLock[i], &lockStateTable); return STATUS_UNSUCCESSFUL; } @@ -1702,7 +1767,7 @@ OvsCtDumpCmdHandler(POVS_USER_PARAMS_CONTEXT usrParamsCtx, ++outIndex; } - + NdisReleaseRWLock(ovsCtBucketLock[i], &lockStateTable); if (entry) { break; } @@ -1716,8 +1781,6 @@ OvsCtDumpCmdHandler(POVS_USER_PARAMS_CONTEXT usrParamsCtx, } instance->dumpState.index[0] = i; instance->dumpState.index[1] = outIndex; - NdisReleaseRWLock(ovsConntrackLockObj, &lockState); - /* if i < CT_HASH_TABLE_SIZE => entry was found */ if (i < CT_HASH_TABLE_SIZE) { POVS_MESSAGE msgOut = (POVS_MESSAGE)usrParamsCtx->outputBuffer; diff --git a/datapath-windows/ovsext/Conntrack.h b/datapath-windows/ovsext/Conntrack.h index 35075db..3be309e 100644 --- a/datapath-windows/ovsext/Conntrack.h +++ b/datapath-windows/ovsext/Conntrack.h @@ -99,6 +99,9 @@ typedef struct _NAT_ACTION_INFO { } NAT_ACTION_INFO, *PNAT_ACTION_INFO; typedef struct OVS_CT_ENTRY { + /* Reference to ovsCtBucketLock of ovsConntrackTable.*/ + PNDIS_RW_LOCK_EX bucketLockRef; + PNDIS_RW_LOCK_EX lock; /* Protects OVS_CT_ENTRY. */ OVS_CT_KEY key; OVS_CT_KEY rev_key; UINT64 expiration;