Message ID | 1585252702-8649-1-git-send-email-pkusunyifeng@gmail.com |
---|---|
State | Superseded |
Headers | show |
Series | [ovs-dev,1/2] tun_metadata: Fix coredump caused by use-after-free bug | expand |
On Fri, Mar 27, 2020 at 3:58 AM Yifeng Sun <pkusunyifeng@gmail.com> wrote: > > Tun_metadata can be referened by flow and frozen_state at the same > time. When ovs-vswitchd handles TLV table mod message, the involved > tun_metadata gets freed. The call trace to free tun_metadata is > shown as below: > > ofproto_run > - handle_openflow > - handle_single_part_openflow > - handle_tlv_table_mod > - tun_metadata_table_mod > - tun_metadata_postpone_free > > Unfortunately, this tun_metadata can be still used by some frozen_state, > and later on when frozen_state tries to access its tun_metadata table, > ovs-vswitchd crashes. The call trace to access tun_metadata from > frozen_state is shown as below: > > udpif_upcall_handler > - recv_upcalls > - process_upcall > - frozen_metadata_to_flow > > This patch fixes it by introducing a reference count to tun_metadata. > Whenever a pointer of tun_metadata is passed between flow and > frozen_state, we increase its reference count. Reference count > is decreased at deallocation. > > In present code, pointer of tun_metadata can be passed between flows. > It is safe because of RCU mechanism. > > VMware-BZ: #2526222 > Signed-off-by: Yifeng Sun <pkusunyifeng@gmail.com> > --- > lib/tun-metadata.c | 29 ++++++++++++++++++++++++++++- > lib/tun-metadata.h | 2 ++ > ofproto/ofproto-dpif-rid.c | 8 ++++++++ > ofproto/ofproto-dpif-rid.h | 2 ++ > 4 files changed, 40 insertions(+), 1 deletion(-) > > diff --git a/lib/tun-metadata.c b/lib/tun-metadata.c > index f8a0e19524e9..c4218a034a92 100644 > --- a/lib/tun-metadata.c > +++ b/lib/tun-metadata.c > @@ -25,6 +25,7 @@ > #include "nx-match.h" > #include "odp-netlink.h" > #include "openvswitch/ofp-match.h" > +#include "ovs-atomic.h" > #include "ovs-rcu.h" > #include "packets.h" > #include "tun-metadata.h" > @@ -40,6 +41,11 @@ struct tun_meta_entry { > /* Maps from TLV option class+type to positions in a struct tun_metadata's > * 'opts' array. */ > struct tun_table { > + /* Struct tun_table can be referenced by struct frozen_state for a long > + * time. This ref_cnt protects tun_table from being freed if it is still > + * being used somewhere. */ > + struct ovs_refcount ref_cnt; > + > /* TUN_METADATA<i> is stored in element <i>. */ > struct tun_meta_entry entries[TUN_METADATA_NUM_OPTS]; > > @@ -79,6 +85,24 @@ tun_key_type(uint32_t key) > return key & 0xff; > } > > +void > +tun_metadata_ref(const struct tun_table *tab) > +{ > + if (tab) { > + ovs_refcount_ref(&CONST_CAST(struct tun_table *, tab)->ref_cnt); > + } > +} > + > +unsigned int > +tun_metadata_unref(const struct tun_table *tab) > +{ > + if (tab) { > + return ovs_refcount_unref_relaxed( > + &CONST_CAST(struct tun_table *, tab)->ref_cnt); In general, xxx_unref will free the struct data, for example, stp_unref/netdev_unref/lldp_unref. > + } > + return -1; > +} > + > /* Returns a newly allocated tun_table. If 'old_map' is nonnull then the new > * tun_table is a deep copy of the old one. */ > struct tun_table * > @@ -111,6 +135,7 @@ tun_metadata_alloc(const struct tun_table *old_map) > hmap_init(&new_map->key_hmap); > } > > + ovs_refcount_init(&new_map->ref_cnt); > return new_map; > } > > @@ -135,7 +160,9 @@ tun_metadata_free(struct tun_table *map) > void > tun_metadata_postpone_free(struct tun_table *tab) > { > - ovsrcu_postpone(tun_metadata_free, tab); > + if (tun_metadata_unref(tab) == 1) { > + ovsrcu_postpone(tun_metadata_free, tab); > + } > } > > enum ofperr > diff --git a/lib/tun-metadata.h b/lib/tun-metadata.h > index 7dad9504b8da..933021a0f679 100644 > --- a/lib/tun-metadata.h > +++ b/lib/tun-metadata.h > @@ -33,6 +33,8 @@ struct ofputil_tlv_table_mod; > struct ofputil_tlv_table_reply; > struct tun_table; > > +void tun_metadata_ref(const struct tun_table *tab); > +unsigned int tun_metadata_unref(const struct tun_table *tab); > struct tun_table *tun_metadata_alloc(const struct tun_table *old_map); > void tun_metadata_free(struct tun_table *); > void tun_metadata_postpone_free(struct tun_table *); > diff --git a/ofproto/ofproto-dpif-rid.c b/ofproto/ofproto-dpif-rid.c > index 29aafc2c0b40..d479e53d9b2d 100644 > --- a/ofproto/ofproto-dpif-rid.c > +++ b/ofproto/ofproto-dpif-rid.c > @@ -201,6 +201,7 @@ static void > frozen_state_clone(struct frozen_state *new, const struct frozen_state *old) > { > *new = *old; > + tun_metadata_ref(old->metadata.tunnel.metadata.tab); > new->stack = (new->stack_size > ? xmemdup(new->stack, new->stack_size) > : NULL); > @@ -218,10 +219,17 @@ frozen_state_clone(struct frozen_state *new, const struct frozen_state *old) > static void > frozen_state_free(struct frozen_state *state) > { > + struct tun_table *tab; > + > free(state->stack); > free(state->ofpacts); > free(state->action_set); > free(state->userdata); > + > + tab = CONST_CAST(struct tun_table *, state->metadata.tunnel.metadata.tab); > + if (tun_metadata_unref(tab) == 1) { > + tun_metadata_free(tab); > + } > } > > /* Allocate a unique recirculation id for the given set of flow metadata. > diff --git a/ofproto/ofproto-dpif-rid.h b/ofproto/ofproto-dpif-rid.h > index e5d02caf28a3..1fefbf53b94a 100644 > --- a/ofproto/ofproto-dpif-rid.h > +++ b/ofproto/ofproto-dpif-rid.h > @@ -115,6 +115,7 @@ frozen_metadata_from_flow(struct frozen_metadata *md, > { > memset(md, 0, sizeof *md); > md->tunnel = flow->tunnel; > + tun_metadata_ref(flow->tunnel.metadata.tab); should we ref it fristly, and then assign it. > md->metadata = flow->metadata; > memcpy(md->regs, flow->regs, sizeof md->regs); > md->in_port = flow->in_port.ofp_port; > @@ -125,6 +126,7 @@ frozen_metadata_to_flow(const struct frozen_metadata *md, > struct flow *flow) > { > flow->tunnel = md->tunnel; > + tun_metadata_ref(md->tunnel.metadata.tab); > flow->metadata = md->metadata; > memcpy(flow->regs, md->regs, sizeof flow->regs); > flow->in_port.ofp_port = md->in_port; > -- > 2.7.4 > > _______________________________________________ > dev mailing list > dev@openvswitch.org > https://mail.openvswitch.org/mailman/listinfo/ovs-dev
Thanks Tonghao. tun_metadata_ref/unref follows the practice of ovs_refcount_ref/unref. In this patch, we need return value of tun_metadata_unref to decide the way to free it. Thanks, Yifeng On Fri, Mar 27, 2020 at 2:45 AM Tonghao Zhang <xiangxia.m.yue@gmail.com> wrote: > > On Fri, Mar 27, 2020 at 3:58 AM Yifeng Sun <pkusunyifeng@gmail.com> wrote: > > > > Tun_metadata can be referened by flow and frozen_state at the same > > time. When ovs-vswitchd handles TLV table mod message, the involved > > tun_metadata gets freed. The call trace to free tun_metadata is > > shown as below: > > > > ofproto_run > > - handle_openflow > > - handle_single_part_openflow > > - handle_tlv_table_mod > > - tun_metadata_table_mod > > - tun_metadata_postpone_free > > > > Unfortunately, this tun_metadata can be still used by some frozen_state, > > and later on when frozen_state tries to access its tun_metadata table, > > ovs-vswitchd crashes. The call trace to access tun_metadata from > > frozen_state is shown as below: > > > > udpif_upcall_handler > > - recv_upcalls > > - process_upcall > > - frozen_metadata_to_flow > > > > This patch fixes it by introducing a reference count to tun_metadata. > > Whenever a pointer of tun_metadata is passed between flow and > > frozen_state, we increase its reference count. Reference count > > is decreased at deallocation. > > > > In present code, pointer of tun_metadata can be passed between flows. > > It is safe because of RCU mechanism. > > > > VMware-BZ: #2526222 > > Signed-off-by: Yifeng Sun <pkusunyifeng@gmail.com> > > --- > > lib/tun-metadata.c | 29 ++++++++++++++++++++++++++++- > > lib/tun-metadata.h | 2 ++ > > ofproto/ofproto-dpif-rid.c | 8 ++++++++ > > ofproto/ofproto-dpif-rid.h | 2 ++ > > 4 files changed, 40 insertions(+), 1 deletion(-) > > > > diff --git a/lib/tun-metadata.c b/lib/tun-metadata.c > > index f8a0e19524e9..c4218a034a92 100644 > > --- a/lib/tun-metadata.c > > +++ b/lib/tun-metadata.c > > @@ -25,6 +25,7 @@ > > #include "nx-match.h" > > #include "odp-netlink.h" > > #include "openvswitch/ofp-match.h" > > +#include "ovs-atomic.h" > > #include "ovs-rcu.h" > > #include "packets.h" > > #include "tun-metadata.h" > > @@ -40,6 +41,11 @@ struct tun_meta_entry { > > /* Maps from TLV option class+type to positions in a struct tun_metadata's > > * 'opts' array. */ > > struct tun_table { > > + /* Struct tun_table can be referenced by struct frozen_state for a long > > + * time. This ref_cnt protects tun_table from being freed if it is still > > + * being used somewhere. */ > > + struct ovs_refcount ref_cnt; > > + > > /* TUN_METADATA<i> is stored in element <i>. */ > > struct tun_meta_entry entries[TUN_METADATA_NUM_OPTS]; > > > > @@ -79,6 +85,24 @@ tun_key_type(uint32_t key) > > return key & 0xff; > > } > > > > +void > > +tun_metadata_ref(const struct tun_table *tab) > > +{ > > + if (tab) { > > + ovs_refcount_ref(&CONST_CAST(struct tun_table *, tab)->ref_cnt); > > + } > > +} > > + > > +unsigned int > > +tun_metadata_unref(const struct tun_table *tab) > > +{ > > + if (tab) { > > + return ovs_refcount_unref_relaxed( > > + &CONST_CAST(struct tun_table *, tab)->ref_cnt); > In general, xxx_unref will free the struct data, for example, > stp_unref/netdev_unref/lldp_unref. > > > + } > > + return -1; > > +} > > + > > /* Returns a newly allocated tun_table. If 'old_map' is nonnull then the new > > * tun_table is a deep copy of the old one. */ > > struct tun_table * > > @@ -111,6 +135,7 @@ tun_metadata_alloc(const struct tun_table *old_map) > > hmap_init(&new_map->key_hmap); > > } > > > > + ovs_refcount_init(&new_map->ref_cnt); > > return new_map; > > } > > > > @@ -135,7 +160,9 @@ tun_metadata_free(struct tun_table *map) > > void > > tun_metadata_postpone_free(struct tun_table *tab) > > { > > - ovsrcu_postpone(tun_metadata_free, tab); > > + if (tun_metadata_unref(tab) == 1) { > > + ovsrcu_postpone(tun_metadata_free, tab); > > + } > > } > > > > enum ofperr > > diff --git a/lib/tun-metadata.h b/lib/tun-metadata.h > > index 7dad9504b8da..933021a0f679 100644 > > --- a/lib/tun-metadata.h > > +++ b/lib/tun-metadata.h > > @@ -33,6 +33,8 @@ struct ofputil_tlv_table_mod; > > struct ofputil_tlv_table_reply; > > struct tun_table; > > > > +void tun_metadata_ref(const struct tun_table *tab); > > +unsigned int tun_metadata_unref(const struct tun_table *tab); > > struct tun_table *tun_metadata_alloc(const struct tun_table *old_map); > > void tun_metadata_free(struct tun_table *); > > void tun_metadata_postpone_free(struct tun_table *); > > diff --git a/ofproto/ofproto-dpif-rid.c b/ofproto/ofproto-dpif-rid.c > > index 29aafc2c0b40..d479e53d9b2d 100644 > > --- a/ofproto/ofproto-dpif-rid.c > > +++ b/ofproto/ofproto-dpif-rid.c > > @@ -201,6 +201,7 @@ static void > > frozen_state_clone(struct frozen_state *new, const struct frozen_state *old) > > { > > *new = *old; > > + tun_metadata_ref(old->metadata.tunnel.metadata.tab); > > new->stack = (new->stack_size > > ? xmemdup(new->stack, new->stack_size) > > : NULL); > > @@ -218,10 +219,17 @@ frozen_state_clone(struct frozen_state *new, const struct frozen_state *old) > > static void > > frozen_state_free(struct frozen_state *state) > > { > > + struct tun_table *tab; > > + > > free(state->stack); > > free(state->ofpacts); > > free(state->action_set); > > free(state->userdata); > > + > > + tab = CONST_CAST(struct tun_table *, state->metadata.tunnel.metadata.tab); > > + if (tun_metadata_unref(tab) == 1) { > > + tun_metadata_free(tab); > > + } > > } > > > > /* Allocate a unique recirculation id for the given set of flow metadata. > > diff --git a/ofproto/ofproto-dpif-rid.h b/ofproto/ofproto-dpif-rid.h > > index e5d02caf28a3..1fefbf53b94a 100644 > > --- a/ofproto/ofproto-dpif-rid.h > > +++ b/ofproto/ofproto-dpif-rid.h > > @@ -115,6 +115,7 @@ frozen_metadata_from_flow(struct frozen_metadata *md, > > { > > memset(md, 0, sizeof *md); > > md->tunnel = flow->tunnel; > > + tun_metadata_ref(flow->tunnel.metadata.tab); > should we ref it fristly, and then assign it. > > > md->metadata = flow->metadata; > > memcpy(md->regs, flow->regs, sizeof md->regs); > > md->in_port = flow->in_port.ofp_port; > > @@ -125,6 +126,7 @@ frozen_metadata_to_flow(const struct frozen_metadata *md, > > struct flow *flow) > > { > > flow->tunnel = md->tunnel; > > + tun_metadata_ref(md->tunnel.metadata.tab); > > flow->metadata = md->metadata; > > memcpy(flow->regs, md->regs, sizeof flow->regs); > > flow->in_port.ofp_port = md->in_port; > > -- > > 2.7.4 > > > > _______________________________________________ > > dev mailing list > > dev@openvswitch.org > > https://mail.openvswitch.org/mailman/listinfo/ovs-dev > > > > -- > Best regards, Tonghao
On Thu, Mar 26, 2020 at 12:58:21PM -0700, Yifeng Sun wrote: > Tun_metadata can be referened by flow and frozen_state at the same > time. When ovs-vswitchd handles TLV table mod message, the involved > tun_metadata gets freed. The call trace to free tun_metadata is > shown as below: > > ofproto_run > - handle_openflow > - handle_single_part_openflow > - handle_tlv_table_mod > - tun_metadata_table_mod > - tun_metadata_postpone_free > > Unfortunately, this tun_metadata can be still used by some frozen_state, > and later on when frozen_state tries to access its tun_metadata table, > ovs-vswitchd crashes. The call trace to access tun_metadata from > frozen_state is shown as below: > > udpif_upcall_handler > - recv_upcalls > - process_upcall > - frozen_metadata_to_flow > > This patch fixes it by introducing a reference count to tun_metadata. > Whenever a pointer of tun_metadata is passed between flow and > frozen_state, we increase its reference count. Reference count > is decreased at deallocation. > > In present code, pointer of tun_metadata can be passed between flows. > It is safe because of RCU mechanism. > > VMware-BZ: #2526222 > Signed-off-by: Yifeng Sun <pkusunyifeng@gmail.com> > --- > lib/tun-metadata.c | 29 ++++++++++++++++++++++++++++- > lib/tun-metadata.h | 2 ++ > ofproto/ofproto-dpif-rid.c | 8 ++++++++ > ofproto/ofproto-dpif-rid.h | 2 ++ > 4 files changed, 40 insertions(+), 1 deletion(-) > > diff --git a/lib/tun-metadata.c b/lib/tun-metadata.c > index f8a0e19524e9..c4218a034a92 100644 > --- a/lib/tun-metadata.c > +++ b/lib/tun-metadata.c > @@ -25,6 +25,7 @@ > #include "nx-match.h" > #include "odp-netlink.h" > #include "openvswitch/ofp-match.h" > +#include "ovs-atomic.h" > #include "ovs-rcu.h" > #include "packets.h" > #include "tun-metadata.h" > @@ -40,6 +41,11 @@ struct tun_meta_entry { > /* Maps from TLV option class+type to positions in a struct tun_metadata's > * 'opts' array. */ > struct tun_table { > + /* Struct tun_table can be referenced by struct frozen_state for a long > + * time. This ref_cnt protects tun_table from being freed if it is still > + * being used somewhere. */ > + struct ovs_refcount ref_cnt; > + > /* TUN_METADATA<i> is stored in element <i>. */ > struct tun_meta_entry entries[TUN_METADATA_NUM_OPTS]; > > @@ -79,6 +85,24 @@ tun_key_type(uint32_t key) > return key & 0xff; > } > > +void > +tun_metadata_ref(const struct tun_table *tab) > +{ > + if (tab) { > + ovs_refcount_ref(&CONST_CAST(struct tun_table *, tab)->ref_cnt); > + } > +} > + > +unsigned int > +tun_metadata_unref(const struct tun_table *tab) > +{ > + if (tab) { > + return ovs_refcount_unref_relaxed( > + &CONST_CAST(struct tun_table *, tab)->ref_cnt); > + } > + return -1; return -1 looks weird since it's unsigned int. > +} > + > /* Returns a newly allocated tun_table. If 'old_map' is nonnull then the new > * tun_table is a deep copy of the old one. */ > struct tun_table * > @@ -111,6 +135,7 @@ tun_metadata_alloc(const struct tun_table *old_map) > hmap_init(&new_map->key_hmap); > } > > + ovs_refcount_init(&new_map->ref_cnt); > return new_map; > } > > @@ -135,7 +160,9 @@ tun_metadata_free(struct tun_table *map) > void > tun_metadata_postpone_free(struct tun_table *tab) > { > - ovsrcu_postpone(tun_metadata_free, tab); > + if (tun_metadata_unref(tab) == 1) { > + ovsrcu_postpone(tun_metadata_free, tab); If this is the last ref and "tun_metadata_unref(tab)== 1" why not just call tun_metadata_free(tab)? Regards, William > + } > } > > enum ofperr > diff --git a/lib/tun-metadata.h b/lib/tun-metadata.h > index 7dad9504b8da..933021a0f679 100644 > --- a/lib/tun-metadata.h > +++ b/lib/tun-metadata.h > @@ -33,6 +33,8 @@ struct ofputil_tlv_table_mod; > struct ofputil_tlv_table_reply; > struct tun_table; > > +void tun_metadata_ref(const struct tun_table *tab); > +unsigned int tun_metadata_unref(const struct tun_table *tab); > struct tun_table *tun_metadata_alloc(const struct tun_table *old_map); > void tun_metadata_free(struct tun_table *); > void tun_metadata_postpone_free(struct tun_table *); > diff --git a/ofproto/ofproto-dpif-rid.c b/ofproto/ofproto-dpif-rid.c > index 29aafc2c0b40..d479e53d9b2d 100644 > --- a/ofproto/ofproto-dpif-rid.c > +++ b/ofproto/ofproto-dpif-rid.c > @@ -201,6 +201,7 @@ static void > frozen_state_clone(struct frozen_state *new, const struct frozen_state *old) > { > *new = *old; > + tun_metadata_ref(old->metadata.tunnel.metadata.tab); > new->stack = (new->stack_size > ? xmemdup(new->stack, new->stack_size) > : NULL); > @@ -218,10 +219,17 @@ frozen_state_clone(struct frozen_state *new, const struct frozen_state *old) > static void > frozen_state_free(struct frozen_state *state) > { > + struct tun_table *tab; > + > free(state->stack); > free(state->ofpacts); > free(state->action_set); > free(state->userdata); > + > + tab = CONST_CAST(struct tun_table *, state->metadata.tunnel.metadata.tab); > + if (tun_metadata_unref(tab) == 1) { > + tun_metadata_free(tab); > + } > } > > /* Allocate a unique recirculation id for the given set of flow metadata. > diff --git a/ofproto/ofproto-dpif-rid.h b/ofproto/ofproto-dpif-rid.h > index e5d02caf28a3..1fefbf53b94a 100644 > --- a/ofproto/ofproto-dpif-rid.h > +++ b/ofproto/ofproto-dpif-rid.h > @@ -115,6 +115,7 @@ frozen_metadata_from_flow(struct frozen_metadata *md, > { > memset(md, 0, sizeof *md); > md->tunnel = flow->tunnel; > + tun_metadata_ref(flow->tunnel.metadata.tab); > md->metadata = flow->metadata; > memcpy(md->regs, flow->regs, sizeof md->regs); > md->in_port = flow->in_port.ofp_port; > @@ -125,6 +126,7 @@ frozen_metadata_to_flow(const struct frozen_metadata *md, > struct flow *flow) > { > flow->tunnel = md->tunnel; > + tun_metadata_ref(md->tunnel.metadata.tab); > flow->metadata = md->metadata; > memcpy(flow->regs, md->regs, sizeof flow->regs); > flow->in_port.ofp_port = md->in_port; > -- > 2.7.4 > > _______________________________________________ > dev mailing list > dev@openvswitch.org > https://mail.openvswitch.org/mailman/listinfo/ovs-dev
On Mon, Apr 6, 2020 at 8:47 AM William Tu <u9012063@gmail.com> wrote: > > On Thu, Mar 26, 2020 at 12:58:21PM -0700, Yifeng Sun wrote: > > Tun_metadata can be referened by flow and frozen_state at the same > > time. When ovs-vswitchd handles TLV table mod message, the involved > > tun_metadata gets freed. The call trace to free tun_metadata is > > shown as below: > > > > ofproto_run > > - handle_openflow > > - handle_single_part_openflow > > - handle_tlv_table_mod > > - tun_metadata_table_mod > > - tun_metadata_postpone_free > > > > Unfortunately, this tun_metadata can be still used by some frozen_state, > > and later on when frozen_state tries to access its tun_metadata table, > > ovs-vswitchd crashes. The call trace to access tun_metadata from > > frozen_state is shown as below: > > > > udpif_upcall_handler > > - recv_upcalls > > - process_upcall > > - frozen_metadata_to_flow > > > > This patch fixes it by introducing a reference count to tun_metadata. > > Whenever a pointer of tun_metadata is passed between flow and > > frozen_state, we increase its reference count. Reference count > > is decreased at deallocation. > > > > In present code, pointer of tun_metadata can be passed between flows. > > It is safe because of RCU mechanism. > > > > VMware-BZ: #2526222 > > Signed-off-by: Yifeng Sun <pkusunyifeng@gmail.com> > > --- Hi Yifeng, Instead of introducing the ref count, how about just using the latest tun_tab? The patch below also fixes the use-after-free coredump. diff --git a/ofproto/ofproto-dpif-upcall.c b/ofproto/ofproto-dpif-upcall.c index 8dfa05b71df4..9046f7d79952 100644 --- a/ofproto/ofproto-dpif-upcall.c +++ b/ofproto/ofproto-dpif-upcall.c @@ -1535,6 +1535,7 @@ process_upcall(struct udpif *udpif, struct upcall *upcall, } frozen_metadata_to_flow(&state->metadata, &frozen_flow); + frozen_flow.tunnel.metadata.tab = ofproto_get_tun_tab(&upcall->ofproto->up); flow_get_metadata(&frozen_flow, &am->pin.up.base.flow_metadata); ofproto_dpif_send_async_msg(upcall->ofproto, am);
Thanks William for the review. We need to use ovsrcu_postpone(tun_metadata_free, tab) because tun_table is protected by RCU and we can only free tun_table when all threads quiesce. It turns out that this fix based on reference count is not beautiful, it doesn't fit well with current RCU mechanism. A previous commit seems fixing the samiliar issue: 254878c18874f6 (ofproto-dpif-xlate: Fix segmentation fault caused by tun_table) It is unsafe for frozen_state to reference tun_table because tun_table is protected by RCU while the lifecycle of frozen_state can span several RCU quiesce states. As we discussed offline, we can simply nullify tun_table in frozen_state. If frozen_state needs tun_table, the latest valid tun_table can be found by ofproto_get_tun_tab() efficiently. I will submit a new version. Yifeng On Mon, Apr 6, 2020 at 8:47 AM William Tu <u9012063@gmail.com> wrote: > > On Thu, Mar 26, 2020 at 12:58:21PM -0700, Yifeng Sun wrote: > > Tun_metadata can be referened by flow and frozen_state at the same > > time. When ovs-vswitchd handles TLV table mod message, the involved > > tun_metadata gets freed. The call trace to free tun_metadata is > > shown as below: > > > > ofproto_run > > - handle_openflow > > - handle_single_part_openflow > > - handle_tlv_table_mod > > - tun_metadata_table_mod > > - tun_metadata_postpone_free > > > > Unfortunately, this tun_metadata can be still used by some frozen_state, > > and later on when frozen_state tries to access its tun_metadata table, > > ovs-vswitchd crashes. The call trace to access tun_metadata from > > frozen_state is shown as below: > > > > udpif_upcall_handler > > - recv_upcalls > > - process_upcall > > - frozen_metadata_to_flow > > > > This patch fixes it by introducing a reference count to tun_metadata. > > Whenever a pointer of tun_metadata is passed between flow and > > frozen_state, we increase its reference count. Reference count > > is decreased at deallocation. > > > > In present code, pointer of tun_metadata can be passed between flows. > > It is safe because of RCU mechanism. > > > > VMware-BZ: #2526222 > > Signed-off-by: Yifeng Sun <pkusunyifeng@gmail.com> > > --- > > lib/tun-metadata.c | 29 ++++++++++++++++++++++++++++- > > lib/tun-metadata.h | 2 ++ > > ofproto/ofproto-dpif-rid.c | 8 ++++++++ > > ofproto/ofproto-dpif-rid.h | 2 ++ > > 4 files changed, 40 insertions(+), 1 deletion(-) > > > > diff --git a/lib/tun-metadata.c b/lib/tun-metadata.c > > index f8a0e19524e9..c4218a034a92 100644 > > --- a/lib/tun-metadata.c > > +++ b/lib/tun-metadata.c > > @@ -25,6 +25,7 @@ > > #include "nx-match.h" > > #include "odp-netlink.h" > > #include "openvswitch/ofp-match.h" > > +#include "ovs-atomic.h" > > #include "ovs-rcu.h" > > #include "packets.h" > > #include "tun-metadata.h" > > @@ -40,6 +41,11 @@ struct tun_meta_entry { > > /* Maps from TLV option class+type to positions in a struct tun_metadata's > > * 'opts' array. */ > > struct tun_table { > > + /* Struct tun_table can be referenced by struct frozen_state for a long > > + * time. This ref_cnt protects tun_table from being freed if it is still > > + * being used somewhere. */ > > + struct ovs_refcount ref_cnt; > > + > > /* TUN_METADATA<i> is stored in element <i>. */ > > struct tun_meta_entry entries[TUN_METADATA_NUM_OPTS]; > > > > @@ -79,6 +85,24 @@ tun_key_type(uint32_t key) > > return key & 0xff; > > } > > > > +void > > +tun_metadata_ref(const struct tun_table *tab) > > +{ > > + if (tab) { > > + ovs_refcount_ref(&CONST_CAST(struct tun_table *, tab)->ref_cnt); > > + } > > +} > > + > > +unsigned int > > +tun_metadata_unref(const struct tun_table *tab) > > +{ > > + if (tab) { > > + return ovs_refcount_unref_relaxed( > > + &CONST_CAST(struct tun_table *, tab)->ref_cnt); > > + } > > + return -1; > return -1 looks weird since it's unsigned int. > > > +} > > + > > /* Returns a newly allocated tun_table. If 'old_map' is nonnull then the new > > * tun_table is a deep copy of the old one. */ > > struct tun_table * > > @@ -111,6 +135,7 @@ tun_metadata_alloc(const struct tun_table *old_map) > > hmap_init(&new_map->key_hmap); > > } > > > > + ovs_refcount_init(&new_map->ref_cnt); > > return new_map; > > } > > > > @@ -135,7 +160,9 @@ tun_metadata_free(struct tun_table *map) > > void > > tun_metadata_postpone_free(struct tun_table *tab) > > { > > - ovsrcu_postpone(tun_metadata_free, tab); > > + if (tun_metadata_unref(tab) == 1) { > > + ovsrcu_postpone(tun_metadata_free, tab); > > If this is the last ref and "tun_metadata_unref(tab)== 1" > why not just call tun_metadata_free(tab)? > > Regards, > William > > > + } > > } > > > > enum ofperr > > diff --git a/lib/tun-metadata.h b/lib/tun-metadata.h > > index 7dad9504b8da..933021a0f679 100644 > > --- a/lib/tun-metadata.h > > +++ b/lib/tun-metadata.h > > @@ -33,6 +33,8 @@ struct ofputil_tlv_table_mod; > > struct ofputil_tlv_table_reply; > > struct tun_table; > > > > +void tun_metadata_ref(const struct tun_table *tab); > > +unsigned int tun_metadata_unref(const struct tun_table *tab); > > struct tun_table *tun_metadata_alloc(const struct tun_table *old_map); > > void tun_metadata_free(struct tun_table *); > > void tun_metadata_postpone_free(struct tun_table *); > > diff --git a/ofproto/ofproto-dpif-rid.c b/ofproto/ofproto-dpif-rid.c > > index 29aafc2c0b40..d479e53d9b2d 100644 > > --- a/ofproto/ofproto-dpif-rid.c > > +++ b/ofproto/ofproto-dpif-rid.c > > @@ -201,6 +201,7 @@ static void > > frozen_state_clone(struct frozen_state *new, const struct frozen_state *old) > > { > > *new = *old; > > + tun_metadata_ref(old->metadata.tunnel.metadata.tab); > > new->stack = (new->stack_size > > ? xmemdup(new->stack, new->stack_size) > > : NULL); > > @@ -218,10 +219,17 @@ frozen_state_clone(struct frozen_state *new, const struct frozen_state *old) > > static void > > frozen_state_free(struct frozen_state *state) > > { > > + struct tun_table *tab; > > + > > free(state->stack); > > free(state->ofpacts); > > free(state->action_set); > > free(state->userdata); > > + > > + tab = CONST_CAST(struct tun_table *, state->metadata.tunnel.metadata.tab); > > + if (tun_metadata_unref(tab) == 1) { > > + tun_metadata_free(tab); > > + } > > } > > > > /* Allocate a unique recirculation id for the given set of flow metadata. > > diff --git a/ofproto/ofproto-dpif-rid.h b/ofproto/ofproto-dpif-rid.h > > index e5d02caf28a3..1fefbf53b94a 100644 > > --- a/ofproto/ofproto-dpif-rid.h > > +++ b/ofproto/ofproto-dpif-rid.h > > @@ -115,6 +115,7 @@ frozen_metadata_from_flow(struct frozen_metadata *md, > > { > > memset(md, 0, sizeof *md); > > md->tunnel = flow->tunnel; > > + tun_metadata_ref(flow->tunnel.metadata.tab); > > md->metadata = flow->metadata; > > memcpy(md->regs, flow->regs, sizeof md->regs); > > md->in_port = flow->in_port.ofp_port; > > @@ -125,6 +126,7 @@ frozen_metadata_to_flow(const struct frozen_metadata *md, > > struct flow *flow) > > { > > flow->tunnel = md->tunnel; > > + tun_metadata_ref(md->tunnel.metadata.tab); > > flow->metadata = md->metadata; > > memcpy(flow->regs, md->regs, sizeof flow->regs); > > flow->in_port.ofp_port = md->in_port; > > -- > > 2.7.4 > > > > _______________________________________________ > > dev mailing list > > dev@openvswitch.org > > https://mail.openvswitch.org/mailman/listinfo/ovs-dev
diff --git a/lib/tun-metadata.c b/lib/tun-metadata.c index f8a0e19524e9..c4218a034a92 100644 --- a/lib/tun-metadata.c +++ b/lib/tun-metadata.c @@ -25,6 +25,7 @@ #include "nx-match.h" #include "odp-netlink.h" #include "openvswitch/ofp-match.h" +#include "ovs-atomic.h" #include "ovs-rcu.h" #include "packets.h" #include "tun-metadata.h" @@ -40,6 +41,11 @@ struct tun_meta_entry { /* Maps from TLV option class+type to positions in a struct tun_metadata's * 'opts' array. */ struct tun_table { + /* Struct tun_table can be referenced by struct frozen_state for a long + * time. This ref_cnt protects tun_table from being freed if it is still + * being used somewhere. */ + struct ovs_refcount ref_cnt; + /* TUN_METADATA<i> is stored in element <i>. */ struct tun_meta_entry entries[TUN_METADATA_NUM_OPTS]; @@ -79,6 +85,24 @@ tun_key_type(uint32_t key) return key & 0xff; } +void +tun_metadata_ref(const struct tun_table *tab) +{ + if (tab) { + ovs_refcount_ref(&CONST_CAST(struct tun_table *, tab)->ref_cnt); + } +} + +unsigned int +tun_metadata_unref(const struct tun_table *tab) +{ + if (tab) { + return ovs_refcount_unref_relaxed( + &CONST_CAST(struct tun_table *, tab)->ref_cnt); + } + return -1; +} + /* Returns a newly allocated tun_table. If 'old_map' is nonnull then the new * tun_table is a deep copy of the old one. */ struct tun_table * @@ -111,6 +135,7 @@ tun_metadata_alloc(const struct tun_table *old_map) hmap_init(&new_map->key_hmap); } + ovs_refcount_init(&new_map->ref_cnt); return new_map; } @@ -135,7 +160,9 @@ tun_metadata_free(struct tun_table *map) void tun_metadata_postpone_free(struct tun_table *tab) { - ovsrcu_postpone(tun_metadata_free, tab); + if (tun_metadata_unref(tab) == 1) { + ovsrcu_postpone(tun_metadata_free, tab); + } } enum ofperr diff --git a/lib/tun-metadata.h b/lib/tun-metadata.h index 7dad9504b8da..933021a0f679 100644 --- a/lib/tun-metadata.h +++ b/lib/tun-metadata.h @@ -33,6 +33,8 @@ struct ofputil_tlv_table_mod; struct ofputil_tlv_table_reply; struct tun_table; +void tun_metadata_ref(const struct tun_table *tab); +unsigned int tun_metadata_unref(const struct tun_table *tab); struct tun_table *tun_metadata_alloc(const struct tun_table *old_map); void tun_metadata_free(struct tun_table *); void tun_metadata_postpone_free(struct tun_table *); diff --git a/ofproto/ofproto-dpif-rid.c b/ofproto/ofproto-dpif-rid.c index 29aafc2c0b40..d479e53d9b2d 100644 --- a/ofproto/ofproto-dpif-rid.c +++ b/ofproto/ofproto-dpif-rid.c @@ -201,6 +201,7 @@ static void frozen_state_clone(struct frozen_state *new, const struct frozen_state *old) { *new = *old; + tun_metadata_ref(old->metadata.tunnel.metadata.tab); new->stack = (new->stack_size ? xmemdup(new->stack, new->stack_size) : NULL); @@ -218,10 +219,17 @@ frozen_state_clone(struct frozen_state *new, const struct frozen_state *old) static void frozen_state_free(struct frozen_state *state) { + struct tun_table *tab; + free(state->stack); free(state->ofpacts); free(state->action_set); free(state->userdata); + + tab = CONST_CAST(struct tun_table *, state->metadata.tunnel.metadata.tab); + if (tun_metadata_unref(tab) == 1) { + tun_metadata_free(tab); + } } /* Allocate a unique recirculation id for the given set of flow metadata. diff --git a/ofproto/ofproto-dpif-rid.h b/ofproto/ofproto-dpif-rid.h index e5d02caf28a3..1fefbf53b94a 100644 --- a/ofproto/ofproto-dpif-rid.h +++ b/ofproto/ofproto-dpif-rid.h @@ -115,6 +115,7 @@ frozen_metadata_from_flow(struct frozen_metadata *md, { memset(md, 0, sizeof *md); md->tunnel = flow->tunnel; + tun_metadata_ref(flow->tunnel.metadata.tab); md->metadata = flow->metadata; memcpy(md->regs, flow->regs, sizeof md->regs); md->in_port = flow->in_port.ofp_port; @@ -125,6 +126,7 @@ frozen_metadata_to_flow(const struct frozen_metadata *md, struct flow *flow) { flow->tunnel = md->tunnel; + tun_metadata_ref(md->tunnel.metadata.tab); flow->metadata = md->metadata; memcpy(flow->regs, md->regs, sizeof flow->regs); flow->in_port.ofp_port = md->in_port;
Tun_metadata can be referened by flow and frozen_state at the same time. When ovs-vswitchd handles TLV table mod message, the involved tun_metadata gets freed. The call trace to free tun_metadata is shown as below: ofproto_run - handle_openflow - handle_single_part_openflow - handle_tlv_table_mod - tun_metadata_table_mod - tun_metadata_postpone_free Unfortunately, this tun_metadata can be still used by some frozen_state, and later on when frozen_state tries to access its tun_metadata table, ovs-vswitchd crashes. The call trace to access tun_metadata from frozen_state is shown as below: udpif_upcall_handler - recv_upcalls - process_upcall - frozen_metadata_to_flow This patch fixes it by introducing a reference count to tun_metadata. Whenever a pointer of tun_metadata is passed between flow and frozen_state, we increase its reference count. Reference count is decreased at deallocation. In present code, pointer of tun_metadata can be passed between flows. It is safe because of RCU mechanism. VMware-BZ: #2526222 Signed-off-by: Yifeng Sun <pkusunyifeng@gmail.com> --- lib/tun-metadata.c | 29 ++++++++++++++++++++++++++++- lib/tun-metadata.h | 2 ++ ofproto/ofproto-dpif-rid.c | 8 ++++++++ ofproto/ofproto-dpif-rid.h | 2 ++ 4 files changed, 40 insertions(+), 1 deletion(-)