From patchwork Mon Mar 29 14:12:43 2010 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Timo Teras X-Patchwork-Id: 48852 X-Patchwork-Delegate: davem@davemloft.net Return-Path: X-Original-To: patchwork-incoming@ozlabs.org Delivered-To: patchwork-incoming@ozlabs.org Received: from vger.kernel.org (vger.kernel.org [209.132.180.67]) by ozlabs.org (Postfix) with ESMTP id 288E2B7CD5 for ; Tue, 30 Mar 2010 01:13:15 +1100 (EST) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1752231Ab0C2ONJ (ORCPT ); Mon, 29 Mar 2010 10:13:09 -0400 Received: from mail-ew0-f220.google.com ([209.85.219.220]:39800 "EHLO mail-ew0-f220.google.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1752792Ab0C2ONF (ORCPT ); Mon, 29 Mar 2010 10:13:05 -0400 Received: by mail-ew0-f220.google.com with SMTP id 20so705352ewy.1 for ; Mon, 29 Mar 2010 07:13:04 -0700 (PDT) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=gamma; h=domainkey-signature:received:received:sender:from:to:cc:subject :date:message-id:x-mailer:in-reply-to:references; bh=VcJa3JDD83bwYSIRLy/Z5HwyOs6NDMw4Yo04ivpeFbE=; b=nzSF/xyDdODNigfY4Vz7j6qyv5CNjHmk02A3eC2fLwuwidIk9yDo5SDWzuYFh9LoRo bk7WtEpB0bMCecboM3m82EOJp9BZEv4hK7F5Rgo4FlI5pUrhidMAbnCc0ejlCk+N9q/X ZHQlDymTQ+AojCZPU/HUmz/Gqfxd5QyR5cHIs= DomainKey-Signature: a=rsa-sha1; c=nofws; d=gmail.com; s=gamma; h=sender:from:to:cc:subject:date:message-id:x-mailer:in-reply-to :references; b=RbeoCvm60FnsGjyz8I9DxUr/idpnIYfBVpOeF0u23J1K6PBImPM34P6z+RES5YS1pX n8a8+03EjnyUIQvJdXXFjCtFoGksFECHITzciFdqFiPd8Ut57WSgzuCipYKbROgQvNZx WApLs0vEtwA77Kca2W32SJZjGhyV1uo+m3020= Received: by 10.213.42.138 with SMTP id s10mr2541008ebe.65.1269871984183; Mon, 29 Mar 2010 07:13:04 -0700 (PDT) Received: from localhost.localdomain (letku109.adsl.netsonic.fi [194.29.195.109]) by mx.google.com with ESMTPS id 16sm2303968ewy.11.2010.03.29.07.13.02 (version=SSLv3 cipher=RC4-MD5); Mon, 29 Mar 2010 07:13:03 -0700 (PDT) From: Timo Teras To: netdev@vger.kernel.org Cc: Herbert Xu , Timo Teras Subject: [PATCH 6/7] xfrm: cache bundles instead of policies for outgoing flows Date: Mon, 29 Mar 2010 17:12:43 +0300 Message-Id: <1269871964-5412-7-git-send-email-timo.teras@iki.fi> X-Mailer: git-send-email 1.6.3.3 In-Reply-To: <1269871964-5412-1-git-send-email-timo.teras@iki.fi> References: <1269871964-5412-1-git-send-email-timo.teras@iki.fi> Sender: netdev-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: netdev@vger.kernel.org __xfrm_lookup() is called for each packet transmitted out of system. The xfrm_find_bundle() does a linear search which can kill system performance depending on how many bundles are required per policy. This modifies __xfrm_lookup() to store bundles directly in the flow cache. If we did not get a hit, we just create a new bundle instead of doing slow search. This means that we can now get multiple xfrm_dst's for same flow (on per-cpu basis). BUGGY: Currently leaks bundles if a per-socket policy requires a bundle. Signed-off-by: Timo Teras --- include/net/flow.h | 6 +- include/net/xfrm.h | 10 +- net/core/flow.c | 4 +- net/ipv4/xfrm4_policy.c | 22 -- net/ipv6/xfrm6_policy.c | 31 -- net/xfrm/xfrm_policy.c | 697 ++++++++++++++++++++++++----------------------- 6 files changed, 370 insertions(+), 400 deletions(-) diff --git a/include/net/flow.h b/include/net/flow.h index f462325..9e97dfa 100644 --- a/include/net/flow.h +++ b/include/net/flow.h @@ -94,12 +94,12 @@ struct flow_cache_entry_ops { }; typedef struct flow_cache_entry_ops **(*flow_resolve_t)( - struct net *net, struct flowi *key, u16 family, - u8 dir, struct flow_cache_entry_ops **old_ops); + struct net *net, struct flowi *key, u16 family, u8 dir, + struct flow_cache_entry_ops **old_ops, void *ctx); extern struct flow_cache_entry_ops **flow_cache_lookup( struct net *net, struct flowi *key, u16 family, - u8 dir, flow_resolve_t resolver); + u8 dir, flow_resolve_t resolver, void *ctx); extern void flow_cache_flush(void); extern atomic_t flow_cache_genid; diff --git a/include/net/xfrm.h b/include/net/xfrm.h index cb8934b..6ce593f 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -267,7 +267,6 @@ struct xfrm_policy_afinfo { xfrm_address_t *saddr, xfrm_address_t *daddr); int (*get_saddr)(struct net *net, xfrm_address_t *saddr, xfrm_address_t *daddr); - struct dst_entry *(*find_bundle)(struct flowi *fl, struct xfrm_policy *policy); void (*decode_session)(struct sk_buff *skb, struct flowi *fl, int reverse); @@ -483,13 +482,13 @@ struct xfrm_policy { struct timer_list timer; struct flow_cache_entry_ops *fc_ops; + atomic_t genid; u32 priority; u32 index; struct xfrm_mark mark; struct xfrm_selector selector; struct xfrm_lifetime_cfg lft; struct xfrm_lifetime_cur curlft; - struct dst_entry *bundles; struct xfrm_policy_walk_entry walk; u8 type; u8 action; @@ -879,11 +878,15 @@ struct xfrm_dst { struct rt6_info rt6; } u; struct dst_entry *route; + struct flow_cache_entry_ops *fc_ops; + struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX]; + int num_pols, num_xfrms; #ifdef CONFIG_XFRM_SUB_POLICY struct flowi *origin; struct xfrm_selector *partner; #endif - u32 genid; + u32 xfrm_genid; + u32 policy_genid; u32 route_mtu_cached; u32 child_mtu_cached; u32 route_cookie; @@ -893,6 +896,7 @@ struct xfrm_dst { #ifdef CONFIG_XFRM static inline void xfrm_dst_destroy(struct xfrm_dst *xdst) { + xfrm_pols_put(xdst->pols, xdst->num_pols); dst_release(xdst->route); if (likely(xdst->u.dst.xfrm)) xfrm_state_put(xdst->u.dst.xfrm); diff --git a/net/core/flow.c b/net/core/flow.c index 0245455..d3a27cc 100644 --- a/net/core/flow.c +++ b/net/core/flow.c @@ -208,7 +208,7 @@ static int flow_key_compare(struct flowi *key1, struct flowi *key2) struct flow_cache_entry_ops **flow_cache_lookup( struct net *net, struct flowi *key, u16 family, u8 dir, - flow_resolve_t resolver) + flow_resolve_t resolver, void *ctx) { struct flow_cache *fc = &flow_cache_global; struct flow_cache_percpu *fcp; @@ -279,7 +279,7 @@ struct flow_cache_entry_ops **flow_cache_lookup( } nocache: - ops = resolver(net, key, family, dir, ops); + ops = resolver(net, key, family, dir, ops, ctx); if (fle) { fle->genid = atomic_read(&flow_cache_genid); if (IS_ERR(ops)) { diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index e4a1483..1705476 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -59,27 +59,6 @@ static int xfrm4_get_saddr(struct net *net, return 0; } -static struct dst_entry * -__xfrm4_find_bundle(struct flowi *fl, struct xfrm_policy *policy) -{ - struct dst_entry *dst; - - read_lock_bh(&policy->lock); - for (dst = policy->bundles; dst; dst = dst->next) { - struct xfrm_dst *xdst = (struct xfrm_dst *)dst; - if (xdst->u.rt.fl.oif == fl->oif && /*XXX*/ - xdst->u.rt.fl.fl4_dst == fl->fl4_dst && - xdst->u.rt.fl.fl4_src == fl->fl4_src && - xdst->u.rt.fl.fl4_tos == fl->fl4_tos && - xfrm_bundle_ok(policy, xdst, fl, AF_INET, 0)) { - dst_clone(dst); - break; - } - } - read_unlock_bh(&policy->lock); - return dst; -} - static int xfrm4_get_tos(struct flowi *fl) { return fl->fl4_tos; @@ -259,7 +238,6 @@ static struct xfrm_policy_afinfo xfrm4_policy_afinfo = { .dst_ops = &xfrm4_dst_ops, .dst_lookup = xfrm4_dst_lookup, .get_saddr = xfrm4_get_saddr, - .find_bundle = __xfrm4_find_bundle, .decode_session = _decode_session4, .get_tos = xfrm4_get_tos, .init_path = xfrm4_init_path, diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index ae18165..8c452fd 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -67,36 +67,6 @@ static int xfrm6_get_saddr(struct net *net, return 0; } -static struct dst_entry * -__xfrm6_find_bundle(struct flowi *fl, struct xfrm_policy *policy) -{ - struct dst_entry *dst; - - /* Still not clear if we should set fl->fl6_{src,dst}... */ - read_lock_bh(&policy->lock); - for (dst = policy->bundles; dst; dst = dst->next) { - struct xfrm_dst *xdst = (struct xfrm_dst*)dst; - struct in6_addr fl_dst_prefix, fl_src_prefix; - - ipv6_addr_prefix(&fl_dst_prefix, - &fl->fl6_dst, - xdst->u.rt6.rt6i_dst.plen); - ipv6_addr_prefix(&fl_src_prefix, - &fl->fl6_src, - xdst->u.rt6.rt6i_src.plen); - if (ipv6_addr_equal(&xdst->u.rt6.rt6i_dst.addr, &fl_dst_prefix) && - ipv6_addr_equal(&xdst->u.rt6.rt6i_src.addr, &fl_src_prefix) && - xfrm_bundle_ok(policy, xdst, fl, AF_INET6, - (xdst->u.rt6.rt6i_dst.plen != 128 || - xdst->u.rt6.rt6i_src.plen != 128))) { - dst_clone(dst); - break; - } - } - read_unlock_bh(&policy->lock); - return dst; -} - static int xfrm6_get_tos(struct flowi *fl) { return 0; @@ -291,7 +261,6 @@ static struct xfrm_policy_afinfo xfrm6_policy_afinfo = { .dst_ops = &xfrm6_dst_ops, .dst_lookup = xfrm6_dst_lookup, .get_saddr = xfrm6_get_saddr, - .find_bundle = __xfrm6_find_bundle, .decode_session = _decode_session6, .get_tos = xfrm6_get_tos, .init_path = xfrm6_init_path, diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index cd9f2bc..11fb48d 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -50,6 +50,7 @@ static DEFINE_SPINLOCK(xfrm_policy_gc_lock); static struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family); static void xfrm_policy_put_afinfo(struct xfrm_policy_afinfo *afinfo); static void xfrm_init_pmtu(struct dst_entry *dst); +static int stale_bundle(struct dst_entry *dst); static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol, int dir); @@ -278,8 +279,6 @@ void xfrm_policy_destroy(struct xfrm_policy *policy) { BUG_ON(!policy->walk.dead); - BUG_ON(policy->bundles); - if (del_timer(&policy->timer)) BUG(); @@ -290,12 +289,7 @@ EXPORT_SYMBOL(xfrm_policy_destroy); static void xfrm_policy_gc_kill(struct xfrm_policy *policy) { - struct dst_entry *dst; - - while ((dst = policy->bundles) != NULL) { - policy->bundles = dst->next; - dst_free(dst); - } + atomic_inc(&policy->genid); if (del_timer(&policy->timer)) atomic_dec(&policy->refcnt); @@ -573,7 +567,6 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl) struct xfrm_policy *delpol; struct hlist_head *chain; struct hlist_node *entry, *newpos; - struct dst_entry *gc_list; u32 mark = policy->mark.v & policy->mark.m; write_lock_bh(&xfrm_policy_lock); @@ -624,33 +617,11 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl) schedule_work(&net->xfrm.policy_hash_work); read_lock_bh(&xfrm_policy_lock); - gc_list = NULL; entry = &policy->bydst; - hlist_for_each_entry_continue(policy, entry, bydst) { - struct dst_entry *dst; - - write_lock(&policy->lock); - dst = policy->bundles; - if (dst) { - struct dst_entry *tail = dst; - while (tail->next) - tail = tail->next; - tail->next = gc_list; - gc_list = dst; - - policy->bundles = NULL; - } - write_unlock(&policy->lock); - } + hlist_for_each_entry_continue(policy, entry, bydst) + atomic_inc(&policy->genid); read_unlock_bh(&xfrm_policy_lock); - while (gc_list) { - struct dst_entry *dst = gc_list; - - gc_list = dst->next; - dst_free(dst); - } - return 0; } EXPORT_SYMBOL(xfrm_policy_insert); @@ -999,30 +970,35 @@ fail: return ret; } +static struct xfrm_policy *__xfrm_policy_lookup( + struct net *net, struct flowi *fl, + u16 family, u8 dir) +{ +#ifdef CONFIG_XFRM_SUB_POLICY + struct xfrm_policy *pol; + + pol = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_SUB, fl, family, dir); + if (pol != NULL) + return pol; +#endif + return xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN, fl, family, dir); +} + static struct flow_cache_entry_ops **xfrm_policy_lookup( struct net *net, struct flowi *fl, u16 family, - u8 dir, struct flow_cache_entry_ops **old_ops) + u8 dir, struct flow_cache_entry_ops **old_ops, void *ctx) { struct xfrm_policy *pol; if (old_ops) xfrm_pol_put(container_of(old_ops, struct xfrm_policy, fc_ops)); -#ifdef CONFIG_XFRM_SUB_POLICY - pol = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_SUB, fl, family, dir); + pol = __xfrm_policy_lookup(net, fl, family, dir); if (IS_ERR(pol)) return (void *) pol; - if (pol) - goto found; -#endif - pol = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN, fl, family, dir); - if (IS_ERR(pol)) - return (void *) pol; - if (pol) - goto found; - return NULL; + if (!pol) + return NULL; -found: /* Resolver returns two references: * one for cache and one for caller of flow_cache_lookup() */ xfrm_pol_hold(pol); @@ -1311,18 +1287,6 @@ xfrm_tmpl_resolve(struct xfrm_policy **pols, int npols, struct flowi *fl, * still valid. */ -static struct dst_entry * -xfrm_find_bundle(struct flowi *fl, struct xfrm_policy *policy, unsigned short family) -{ - struct dst_entry *x; - struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); - if (unlikely(afinfo == NULL)) - return ERR_PTR(-EINVAL); - x = afinfo->find_bundle(fl, policy); - xfrm_policy_put_afinfo(afinfo); - return x; -} - static inline int xfrm_get_tos(struct flowi *fl, int family) { struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); @@ -1338,6 +1302,53 @@ static inline int xfrm_get_tos(struct flowi *fl, int family) return tos; } +static struct flow_cache_entry_ops **xfrm_bundle_get_fce( + struct flow_cache_entry_ops **ops) +{ + struct xfrm_dst *xdst = container_of(ops, struct xfrm_dst, fc_ops); + struct dst_entry *dst = (struct dst_entry*) xdst; + + if (xdst->route == NULL) { + /* Dummy bundle - if it has xfrms we were not + * able to build bundle as template resolution failed. + * It means we need to try again resolving. */ + if (xdst->num_xfrms > 0) + return NULL; + } else { + /* Real bundle */ + if (stale_bundle(dst)) + return NULL; + } + + dst_hold(dst); + return ops; +} + +static int xfrm_bundle_check_fce(struct flow_cache_entry_ops **ops) +{ + struct xfrm_dst *xdst = container_of(ops, struct xfrm_dst, fc_ops); + struct dst_entry *dst = (struct dst_entry*) xdst; + + if (xdst->route && stale_bundle(dst)) + return 0; + + return 1; +} + +static void xfrm_bundle_delete_fce(struct flow_cache_entry_ops **ops) +{ + struct xfrm_dst *xdst = container_of(ops, struct xfrm_dst, fc_ops); + struct dst_entry *dst = (struct dst_entry*) xdst; + + dst_free(dst); +} + +static struct flow_cache_entry_ops xfrm_bundle_fc_ops __read_mostly = { + .get = xfrm_bundle_get_fce, + .check = xfrm_bundle_check_fce, + .delete = xfrm_bundle_delete_fce, +}; + static inline struct xfrm_dst *xfrm_alloc_dst(struct net *net, int family) { struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); @@ -1360,9 +1371,10 @@ static inline struct xfrm_dst *xfrm_alloc_dst(struct net *net, int family) BUG(); } xdst = dst_alloc(dst_ops) ?: ERR_PTR(-ENOBUFS); - xfrm_policy_put_afinfo(afinfo); + xdst->fc_ops = &xfrm_bundle_fc_ops; + return xdst; } @@ -1400,6 +1412,7 @@ static inline int xfrm_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, return err; } + /* Allocate chain of dst_entry's, attach known xfrm's, calculate * all the metrics... Shortly, bundle a bundle. */ @@ -1463,7 +1476,7 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy, dst_hold(dst); dst1->xfrm = xfrm[i]; - xdst->genid = xfrm[i]->genid; + xdst->xfrm_genid = xfrm[i]->genid; dst1->obsolete = -1; dst1->flags |= DST_HOST; @@ -1556,7 +1569,187 @@ xfrm_dst_update_origin(struct dst_entry *dst, struct flowi *fl) #endif } -static int stale_bundle(struct dst_entry *dst); +static int xfrm_resolve_policies(struct xfrm_policy *main_policy, + struct flowi *fl, u16 family, + struct xfrm_policy **pols, + int *num_pols, int *num_xfrms) +{ + int i; + + if (!main_policy) { + *num_pols = 0; + *num_xfrms = 0; + return 0; + } + if (IS_ERR(main_policy)) + return PTR_ERR(main_policy); + + pols[0] = main_policy; + *num_pols = 1; + *num_xfrms = pols[0]->xfrm_nr; + +#ifdef CONFIG_XFRM_SUB_POLICY + if (pols[0] && pols[0]->action == XFRM_POLICY_ALLOW && + pols[0]->type != XFRM_POLICY_TYPE_MAIN) { + pols[1] = xfrm_policy_lookup_bytype(xp_net(main_policy), + XFRM_POLICY_TYPE_MAIN, + fl, family, + XFRM_POLICY_OUT); + if (pols[1]) { + if (IS_ERR(pols[1])) { + xfrm_pols_put(pols, *num_pols); + return PTR_ERR(pols[1]); + } + (*num_pols) ++; + (*num_xfrms) += pols[1]->xfrm_nr; + } + } +#endif + for (i = 0; i < *num_pols; i++) { + if (pols[i]->action != XFRM_POLICY_ALLOW) { + *num_xfrms = -1; + break; + } + } + + return 0; + +} + +static struct xfrm_dst *xfrm_resolve_and_create_bundle( + struct xfrm_policy **pols, int num_pols, + struct flowi *fl, u16 family, struct dst_entry *dst_orig) +{ + struct net *net = xp_net(pols[0]); + struct xfrm_state *xfrm[XFRM_MAX_DEPTH]; + struct dst_entry *dst; + struct xfrm_dst *xdst; + int err; + + /* Try to instantiate a bundle */ + err = xfrm_tmpl_resolve(pols, num_pols, fl, xfrm, family); + if (err < 0) { + if (err != -EAGAIN) + XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR); + return ERR_PTR(err); + } + + dst = xfrm_bundle_create(pols[0], xfrm, err, fl, dst_orig); + if (IS_ERR(dst)) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTBUNDLEGENERROR); + return (void*) dst; + } + + xdst = (struct xfrm_dst *)dst; + xdst->num_xfrms = err; + if (num_pols > 1) + err = xfrm_dst_update_parent(dst, &pols[1]->selector); + else + err = xfrm_dst_update_origin(dst, fl); + if (unlikely(err)) { + dst_free(dst); + XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTBUNDLECHECKERROR); + return ERR_PTR(err); + } + + xdst->num_pols = num_pols; + memcpy(xdst->pols, pols, sizeof(struct xfrm_policy*) * num_pols); + xdst->policy_genid = atomic_read(&pols[0]->genid); + + return xdst; +} + +static struct flow_cache_entry_ops **xfrm_bundle_lookup( + struct net *net, struct flowi *fl, u16 family, u8 dir, + struct flow_cache_entry_ops **old_ops, void *ctx) +{ + struct dst_entry *dst_orig = (struct dst_entry *)ctx; + struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX]; + struct xfrm_dst *xdst, *new_xdst; + int num_pols = 0, num_xfrms = 0, i, err, pol_dead; + + /* Check if the policies from old bundle are usable */ + xdst = NULL; + if (old_ops) { + xdst = container_of(old_ops, struct xfrm_dst, fc_ops); + num_pols = xdst->num_pols; + num_xfrms = xdst->num_xfrms; + pol_dead = 0; + for (i = 0; i < num_pols; i++) { + pols[i] = xdst->pols[i]; + pol_dead |= pols[i]->walk.dead; + } + if (pol_dead) { + dst_free((struct dst_entry*) xdst); + xdst = NULL; + num_pols = 0; + num_xfrms = 0; + old_ops = NULL; + } + } + + /* Resolve policies to use if we couldn't get them from + * previous cache entry */ + if (xdst == NULL) { + pols[0] = __xfrm_policy_lookup(net, fl, family, dir); + err = xfrm_resolve_policies(pols[0], fl, family, + pols, &num_pols, &num_xfrms); + if (err < 0) + goto inc_error; + if (num_pols == 0) + return NULL; + if (num_xfrms <= 0) + goto make_dummy_bundle; + } + + new_xdst = xfrm_resolve_and_create_bundle(pols, num_pols, fl, family, dst_orig); + if (IS_ERR(new_xdst)) { + err = PTR_ERR(new_xdst); + if (err != -EAGAIN) + goto error; + if (old_ops == NULL) + goto make_dummy_bundle; + dst_hold((struct dst_entry*) xdst); + return old_ops; + } + + /* Kill the previous bundle */ + if (xdst) { + /* The policies were stolen for newly generated bundle */ + xdst->num_pols = 0; + dst_free((struct dst_entry*)xdst); + } + + /* Flow cache does not have reference, it dst_free()'s, + * but we do need to return one reference for original caller */ + dst_hold((struct dst_entry*)new_xdst); + return &new_xdst->fc_ops; + +make_dummy_bundle: + /* We found policies, but there's no bundles to instantiate: + * either because the policy blocks, has no transformations or + * we could not build template (no xfrm_states).*/ + xdst = xfrm_alloc_dst(net, family); + if (IS_ERR(xdst)) { + xfrm_pols_put(pols, num_pols); + return (void*) xdst; + } + xdst->num_pols = num_pols; + xdst->num_xfrms = num_xfrms; + memcpy(xdst->pols, pols, sizeof(struct xfrm_policy*) * num_pols); + + dst_hold((struct dst_entry*)xdst); + return &xdst->fc_ops; + +inc_error: + XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR); +error: + if (xdst != NULL) + dst_free((struct dst_entry*) xdst); + else + xfrm_pols_put(pols, num_pols); + return ERR_PTR(err); +} /* Main function: finds/creates a bundle for given flow. * @@ -1566,248 +1759,138 @@ static int stale_bundle(struct dst_entry *dst); int __xfrm_lookup(struct net *net, struct dst_entry **dst_p, struct flowi *fl, struct sock *sk, int flags) { - struct xfrm_policy *policy; - struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX]; - int npols; - int pol_dead; - int xfrm_nr; - int pi; - struct xfrm_state *xfrm[XFRM_MAX_DEPTH]; - struct dst_entry *dst, *dst_orig = *dst_p; - int nx = 0; - int err; - u32 genid; - u16 family; + struct flow_cache_entry_ops **ops; + struct xfrm_dst *xdst; + struct xfrm_policy *pol; + struct dst_entry *dst, *dst_orig = *dst_p, *route; + u16 family = dst_orig->ops->family; u8 dir = policy_to_flow_dir(XFRM_POLICY_OUT); + int err, num_pols, num_xfrms; restart: - genid = atomic_read(&flow_cache_genid); - policy = NULL; - for (pi = 0; pi < ARRAY_SIZE(pols); pi++) - pols[pi] = NULL; - npols = 0; - pol_dead = 0; - xfrm_nr = 0; + dst = NULL; + xdst = NULL; + route = NULL; if (sk && sk->sk_policy[XFRM_POLICY_OUT]) { - policy = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl); - err = PTR_ERR(policy); - if (IS_ERR(policy)) { - XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR); + struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX]; + + pol = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl); + err = xfrm_resolve_policies(pol, fl, family, + pols, &num_pols, &num_xfrms); + if (err < 0) goto dropdst; + + if (num_pols) { + if (num_xfrms <= 0) + goto no_transform; + + xdst = xfrm_resolve_and_create_bundle( + pols, num_pols, fl, + family, dst_orig); + if (IS_ERR(xdst)) { + xfrm_pols_put(pols, num_pols); + err = PTR_ERR(xdst); + goto dropdst; + } + route = xdst->route; } } - if (!policy) { - struct flow_cache_entry_ops **ops; - + if (xdst == NULL) { /* To accelerate a bit... */ if ((dst_orig->flags & DST_NOXFRM) || !net->xfrm.policy_count[XFRM_POLICY_OUT]) goto nopol; - ops = flow_cache_lookup(net, fl, dst_orig->ops->family, - dir, xfrm_policy_lookup); - err = PTR_ERR(ops); + ops = flow_cache_lookup(net, fl, family, dir, + xfrm_bundle_lookup, dst_orig); + if (ops == NULL) + goto nopol; if (IS_ERR(ops)) { - XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR); + err = PTR_ERR(ops); goto dropdst; } - if (ops) - policy = container_of(ops, struct xfrm_policy, fc_ops); - else - policy = NULL; - } + xdst = container_of(ops, struct xfrm_dst, fc_ops); + + num_pols = xdst->num_pols; + num_xfrms = xdst->num_xfrms; + pol = xdst->pols[0]; + route = xdst->route; + } + + dst = (struct dst_entry *) xdst; + if (route == NULL && num_xfrms > 0) { + dst_release(dst); + + /* The only case when xfrm_bundle_lookup() returns a + * bundle with null route, is when the template could + * not be resolved. It means policies are there, but + * bundle could not be created, since we don't yet + * have the xfrm_state's. We need to wait for KM to + * negotiate new SA's or bail out with error.*/ + if (net->xfrm.sysctl_larval_drop) { + /* EREMOTE tells the caller to generate + * a one-shot blackhole route. */ + XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOSTATES); + return -EREMOTE; + } + if (flags & XFRM_LOOKUP_WAIT) { + DECLARE_WAITQUEUE(wait, current); - if (!policy) - goto nopol; + add_wait_queue(&net->xfrm.km_waitq, &wait); + set_current_state(TASK_INTERRUPTIBLE); + schedule(); + set_current_state(TASK_RUNNING); + remove_wait_queue(&net->xfrm.km_waitq, &wait); - family = dst_orig->ops->family; - pols[0] = policy; - npols ++; - xfrm_nr += pols[0]->xfrm_nr; + if (!signal_pending(current)) + goto restart; - err = -ENOENT; - if ((flags & XFRM_LOOKUP_ICMP) && !(policy->flags & XFRM_POLICY_ICMP)) + err = -ERESTART; + } else + err = -EAGAIN; + + XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOSTATES); goto error; + } - policy->curlft.use_time = get_seconds(); +no_transform: + if (num_pols == 0) + goto nopol; - switch (policy->action) { - default: - case XFRM_POLICY_BLOCK: + if ((flags & XFRM_LOOKUP_ICMP) && + !(pol->flags & XFRM_POLICY_ICMP)) { + err = -ENOENT; + goto error; + } + + pol->curlft.use_time = get_seconds(); + if (num_xfrms < 0) { /* Prohibit the flow */ XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLBLOCK); err = -EPERM; goto error; - - case XFRM_POLICY_ALLOW: -#ifndef CONFIG_XFRM_SUB_POLICY - if (policy->xfrm_nr == 0) { - /* Flow passes not transformed. */ - xfrm_pol_put(policy); - return 0; - } -#endif - - /* Try to find matching bundle. - * - * LATER: help from flow cache. It is optional, this - * is required only for output policy. - */ - dst = xfrm_find_bundle(fl, policy, family); - if (IS_ERR(dst)) { - XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTBUNDLECHECKERROR); - err = PTR_ERR(dst); - goto error; - } - - if (dst) - break; - -#ifdef CONFIG_XFRM_SUB_POLICY - if (pols[0]->type != XFRM_POLICY_TYPE_MAIN) { - pols[1] = xfrm_policy_lookup_bytype(net, - XFRM_POLICY_TYPE_MAIN, - fl, family, - XFRM_POLICY_OUT); - if (pols[1]) { - if (IS_ERR(pols[1])) { - XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR); - err = PTR_ERR(pols[1]); - goto error; - } - if (pols[1]->action == XFRM_POLICY_BLOCK) { - XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLBLOCK); - err = -EPERM; - goto error; - } - npols ++; - xfrm_nr += pols[1]->xfrm_nr; - } - } - - /* - * Because neither flowi nor bundle information knows about - * transformation template size. On more than one policy usage - * we can realize whether all of them is bypass or not after - * they are searched. See above not-transformed bypass - * is surrounded by non-sub policy configuration, too. - */ - if (xfrm_nr == 0) { - /* Flow passes not transformed. */ - xfrm_pols_put(pols, npols); - return 0; - } - -#endif - nx = xfrm_tmpl_resolve(pols, npols, fl, xfrm, family); - - if (unlikely(nx<0)) { - err = nx; - if (err == -EAGAIN && net->xfrm.sysctl_larval_drop) { - /* EREMOTE tells the caller to generate - * a one-shot blackhole route. - */ - XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOSTATES); - xfrm_pol_put(policy); - return -EREMOTE; - } - if (err == -EAGAIN && (flags & XFRM_LOOKUP_WAIT)) { - DECLARE_WAITQUEUE(wait, current); - - add_wait_queue(&net->xfrm.km_waitq, &wait); - set_current_state(TASK_INTERRUPTIBLE); - schedule(); - set_current_state(TASK_RUNNING); - remove_wait_queue(&net->xfrm.km_waitq, &wait); - - nx = xfrm_tmpl_resolve(pols, npols, fl, xfrm, family); - - if (nx == -EAGAIN && signal_pending(current)) { - XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOSTATES); - err = -ERESTART; - goto error; - } - if (nx == -EAGAIN || - genid != atomic_read(&flow_cache_genid)) { - xfrm_pols_put(pols, npols); - goto restart; - } - err = nx; - } - if (err < 0) { - XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOSTATES); - goto error; - } - } - if (nx == 0) { - /* Flow passes not transformed. */ - xfrm_pols_put(pols, npols); - return 0; - } - - dst = xfrm_bundle_create(policy, xfrm, nx, fl, dst_orig); - err = PTR_ERR(dst); - if (IS_ERR(dst)) { - XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTBUNDLEGENERROR); - goto error; - } - - for (pi = 0; pi < npols; pi++) - pol_dead |= pols[pi]->walk.dead; - - write_lock_bh(&policy->lock); - if (unlikely(pol_dead || stale_bundle(dst))) { - /* Wow! While we worked on resolving, this - * policy has gone. Retry. It is not paranoia, - * we just cannot enlist new bundle to dead object. - * We can't enlist stable bundles either. - */ - write_unlock_bh(&policy->lock); - dst_free(dst); - - if (pol_dead) - XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLDEAD); - else - XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTBUNDLECHECKERROR); - err = -EHOSTUNREACH; - goto error; - } - - if (npols > 1) - err = xfrm_dst_update_parent(dst, &pols[1]->selector); - else - err = xfrm_dst_update_origin(dst, fl); - if (unlikely(err)) { - write_unlock_bh(&policy->lock); - dst_free(dst); - XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTBUNDLECHECKERROR); - goto error; - } - - dst->next = policy->bundles; - policy->bundles = dst; - dst_hold(dst); - write_unlock_bh(&policy->lock); + } else if (num_xfrms > 0) { + /* Flow transformed */ + *dst_p = dst; + dst_release(dst_orig); + } else { + /* Flow passes untransformed */ + dst_release(dst); } - *dst_p = dst; - dst_release(dst_orig); - xfrm_pols_put(pols, npols); return 0; +nopol: + if (!(flags & XFRM_LOOKUP_ICMP)) + return 0; + err = -ENOENT; error: - xfrm_pols_put(pols, npols); + dst_release(dst); dropdst: dst_release(dst_orig); *dst_p = NULL; return err; - -nopol: - err = -ENOENT; - if (flags & XFRM_LOOKUP_ICMP) - goto dropdst; - return 0; } EXPORT_SYMBOL(__xfrm_lookup); @@ -1970,7 +2053,7 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, struct flow_cache_entry_ops **ops; ops = flow_cache_lookup(net, &fl, family, fl_dir, - xfrm_policy_lookup); + xfrm_policy_lookup, NULL); if (IS_ERR(ops)) pol = (void *) ops; else if (ops) @@ -2159,69 +2242,9 @@ static struct dst_entry *xfrm_negative_advice(struct dst_entry *dst) return dst; } -static void prune_one_bundle(struct xfrm_policy *pol, int (*func)(struct dst_entry *), struct dst_entry **gc_list_p) -{ - struct dst_entry *dst, **dstp; - - write_lock(&pol->lock); - dstp = &pol->bundles; - while ((dst=*dstp) != NULL) { - if (func(dst)) { - *dstp = dst->next; - dst->next = *gc_list_p; - *gc_list_p = dst; - } else { - dstp = &dst->next; - } - } - write_unlock(&pol->lock); -} - -static void xfrm_prune_bundles(struct net *net, int (*func)(struct dst_entry *)) -{ - struct dst_entry *gc_list = NULL; - int dir; - - read_lock_bh(&xfrm_policy_lock); - for (dir = 0; dir < XFRM_POLICY_MAX * 2; dir++) { - struct xfrm_policy *pol; - struct hlist_node *entry; - struct hlist_head *table; - int i; - - hlist_for_each_entry(pol, entry, - &net->xfrm.policy_inexact[dir], bydst) - prune_one_bundle(pol, func, &gc_list); - - table = net->xfrm.policy_bydst[dir].table; - for (i = net->xfrm.policy_bydst[dir].hmask; i >= 0; i--) { - hlist_for_each_entry(pol, entry, table + i, bydst) - prune_one_bundle(pol, func, &gc_list); - } - } - read_unlock_bh(&xfrm_policy_lock); - - while (gc_list) { - struct dst_entry *dst = gc_list; - gc_list = dst->next; - dst_free(dst); - } -} - -static int unused_bundle(struct dst_entry *dst) -{ - return !atomic_read(&dst->__refcnt); -} - static void __xfrm_garbage_collect(struct net *net) { - xfrm_prune_bundles(net, unused_bundle); -} - -static int xfrm_flush_bundles(struct net *net) -{ - xfrm_prune_bundles(net, stale_bundle); - return 0; + flow_cache_flush(); } static void xfrm_init_pmtu(struct dst_entry *dst) @@ -2281,7 +2304,9 @@ int xfrm_bundle_ok(struct xfrm_policy *pol, struct xfrm_dst *first, return 0; if (dst->xfrm->km.state != XFRM_STATE_VALID) return 0; - if (xdst->genid != dst->xfrm->genid) + if (xdst->xfrm_genid != dst->xfrm->genid) + return 0; + if (xdst->policy_genid != atomic_read(&xdst->pols[0]->genid)) return 0; if (strict && fl && @@ -2442,11 +2467,9 @@ static void xfrm_policy_put_afinfo(struct xfrm_policy_afinfo *afinfo) static int xfrm_dev_event(struct notifier_block *this, unsigned long event, void *ptr) { - struct net_device *dev = ptr; - switch (event) { case NETDEV_DOWN: - xfrm_flush_bundles(dev_net(dev)); + flow_cache_flush(); } return NOTIFY_DONE; } @@ -2778,7 +2801,6 @@ static int xfrm_policy_migrate(struct xfrm_policy *pol, struct xfrm_migrate *m, int num_migrate) { struct xfrm_migrate *mp; - struct dst_entry *dst; int i, j, n = 0; write_lock_bh(&pol->lock); @@ -2803,10 +2825,7 @@ static int xfrm_policy_migrate(struct xfrm_policy *pol, sizeof(pol->xfrm_vec[i].saddr)); pol->xfrm_vec[i].encap_family = mp->new_family; /* flush bundles */ - while ((dst = pol->bundles) != NULL) { - pol->bundles = dst->next; - dst_free(dst); - } + atomic_inc(&pol->genid); } }