diff mbox

[ovs-dev,2/3] bond: fix memory leak and invalid memory write

Message ID 1487500887-29118-2-git-send-email-hanxueluo@126.com
State Accepted
Headers show

Commit Message

hanxueluo@126.com Feb. 19, 2017, 10:41 a.m. UTC
From: Huanle Han <hanxueluo@gmail.com>

The recirc rule_op of tcp-balance bond is still referenced
though its memory is freed. And the freed object will be
written invalidly on following bond_update_post_recirc_rules.
The commit releases the reference to the rule_ops when
tcp-balance bond destroy or balance type change.

Thread 23 handler69:
Invalid write of size 8
   update_recirc_rules (bond.c:385)
   bond_update_post_recirc_rules__ (bond.c:952)
   bond_update_post_recirc_rules (bond.c:960)
   output_normal (ofproto-dpif-xlate.c:2102)
   xlate_normal (ofproto-dpif-xlate.c:2858)
   xlate_output_action (ofproto-dpif-xlate.c:4407)
   do_xlate_actions (ofproto-dpif-xlate.c:5335)
   xlate_actions (ofproto-dpif-xlate.c:6198)
   upcall_xlate (ofproto-dpif-upcall.c:1129)
   process_upcall (ofproto-dpif-upcall.c:1271)
   recv_upcalls (ofproto-dpif-upcall.c:822)
   udpif_upcall_handler (ofproto-dpif-upcall.c:740)
 Address 0x18630490 is 1,904 bytes inside a block of size 12,288 free'd
    free (vg_replace_malloc.c:529)
   bond_entry_reset (bond.c:1635)
   bond_reconfigure (bond.c:457)
   bundle_set (ofproto-dpif.c:2896)
   ofproto_bundle_register (ofproto.c:1343)
   port_configure (bridge.c:1159)
   bridge_reconfigure (bridge.c:785)
   bridge_run (bridge.c:3099)
   main (ovs-vswitchd.c:111)
 Block was alloc'd at
    malloc (vg_replace_malloc.c:298)
   xmalloc (util.c:110)
   bond_entry_reset (bond.c:1629)
   bond_reconfigure (bond.c:457)
   bond_create (bond.c:245)
   bundle_set (ofproto-dpif.c:2900)
   ofproto_bundle_register (ofproto.c:1343)
   port_configure (bridge.c:1159)
   bridge_reconfigure (bridge.c:785)
   bridge_run (bridge.c:3099)
   main (ovs-vswitchd.c:111)

Signed-off-by: Huanle Han <hanxueluo@gmail.com>
---
 ofproto/bond.c | 38 +++++++++++++++++++++++++++++++++-----
 1 file changed, 33 insertions(+), 5 deletions(-)

Comments

Andy Zhou Feb. 23, 2017, 9:43 p.m. UTC | #1
On Sun, Feb 19, 2017 at 2:41 AM,  <hanxueluo@126.com> wrote:
> From: Huanle Han <hanxueluo@gmail.com>
>
> The recirc rule_op of tcp-balance bond is still referenced
> though its memory is freed. And the freed object will be
> written invalidly on following bond_update_post_recirc_rules.
> The commit releases the reference to the rule_ops when
> tcp-balance bond destroy or balance type change.
>
Thanks for the bug report and for working on it!

> Thread 23 handler69:
> Invalid write of size 8
>    update_recirc_rules (bond.c:385)
>    bond_update_post_recirc_rules__ (bond.c:952)
>    bond_update_post_recirc_rules (bond.c:960)
>    output_normal (ofproto-dpif-xlate.c:2102)
>    xlate_normal (ofproto-dpif-xlate.c:2858)
>    xlate_output_action (ofproto-dpif-xlate.c:4407)
>    do_xlate_actions (ofproto-dpif-xlate.c:5335)
>    xlate_actions (ofproto-dpif-xlate.c:6198)
>    upcall_xlate (ofproto-dpif-upcall.c:1129)
>    process_upcall (ofproto-dpif-upcall.c:1271)
>    recv_upcalls (ofproto-dpif-upcall.c:822)
>    udpif_upcall_handler (ofproto-dpif-upcall.c:740)
>  Address 0x18630490 is 1,904 bytes inside a block of size 12,288 free'd
>     free (vg_replace_malloc.c:529)
>    bond_entry_reset (bond.c:1635)
>    bond_reconfigure (bond.c:457)
>    bundle_set (ofproto-dpif.c:2896)
>    ofproto_bundle_register (ofproto.c:1343)
>    port_configure (bridge.c:1159)
>    bridge_reconfigure (bridge.c:785)
>    bridge_run (bridge.c:3099)
>    main (ovs-vswitchd.c:111)
>  Block was alloc'd at
>     malloc (vg_replace_malloc.c:298)
>    xmalloc (util.c:110)
>    bond_entry_reset (bond.c:1629)
>    bond_reconfigure (bond.c:457)
>    bond_create (bond.c:245)
>    bundle_set (ofproto-dpif.c:2900)
>    ofproto_bundle_register (ofproto.c:1343)
>    port_configure (bridge.c:1159)
>    bridge_reconfigure (bridge.c:785)
>    bridge_run (bridge.c:3099)
>    main (ovs-vswitchd.c:111)

It is not clear to me how the fix proposed by the patch explains the
cause of the backtrace.

To me, the backtrace suggests a race condition between revalidate
thread and the main thread.
I think I found the race condition, and proposed a fix at:

https://mail.openvswitch.org/pipermail/ovs-dev/2017-February/329073.html

>
> Signed-off-by: Huanle Han <hanxueluo@gmail.com>
> ---
>  ofproto/bond.c | 38 +++++++++++++++++++++++++++++++++-----
>  1 file changed, 33 insertions(+), 5 deletions(-)
>
> diff --git a/ofproto/bond.c b/ofproto/bond.c
> index 260023e..1c5ae43 100644
> --- a/ofproto/bond.c
> +++ b/ofproto/bond.c
> @@ -275,6 +275,22 @@ bond_unref(struct bond *bond)
>      hmap_remove(all_bonds, &bond->hmap_node);
>      ovs_rwlock_unlock(&rwlock);
>
> +    HMAP_FOR_EACH_POP (pr_op, hmap_node, &bond->pr_rule_ops) {
> +        int error = ofproto_dpif_delete_internal_flow(bond->ofproto,
> +                &pr_op->match, RECIRC_RULE_PRIORITY);
> +        if (error) {
> +            char *err_s = match_to_string(&pr_op->match,
> +                    RECIRC_RULE_PRIORITY);
> +
> +            VLOG_ERR("failed to remove post recirculation flow %s when bond_unref", err_s);
> +            free(err_s);
> +        }
> +
> +        *pr_op->pr_rule = NULL;
> +        free(pr_op);
> +    }
> +    hmap_destroy(&bond->pr_rule_ops);
> +
>      HMAP_FOR_EACH_POP (slave, hmap_node, &bond->slaves) {
>          /* Client owns 'slave->netdev'. */
>          free(slave->name);
> @@ -284,13 +300,9 @@ bond_unref(struct bond *bond)
>
>      ovs_mutex_destroy(&bond->mutex);
>      free(bond->hash);
> +    bond->hash = NULL;
>      free(bond->name);
>
> -    HMAP_FOR_EACH_POP (pr_op, hmap_node, &bond->pr_rule_ops) {
> -        free(pr_op);
> -    }
> -    hmap_destroy(&bond->pr_rule_ops);
> -
>      if (bond->recirc_id) {
>          recirc_free_id(bond->recirc_id);
>      }
> @@ -1635,6 +1647,22 @@ bond_entry_reset(struct bond *bond)
>
>          bond->next_rebalance = time_msec() + bond->rebalance_interval;
>      } else {
> +        struct bond_pr_rule_op *pr_op;
> +        HMAP_FOR_EACH_POP (pr_op, hmap_node, &bond->pr_rule_ops) {
> +            int error = ofproto_dpif_delete_internal_flow(bond->ofproto,
> +                    &pr_op->match, RECIRC_RULE_PRIORITY);
> +            if (error) {
> +                char *err_s = match_to_string(&pr_op->match,
> +                        RECIRC_RULE_PRIORITY);
> +
> +                VLOG_ERR("failed to remove post recirculation flow %s when bond reset", err_s);
> +                free(err_s);
> +            }
> +
> +            hmap_remove(&bond->pr_rule_ops, &pr_op->hmap_node);
> +            *pr_op->pr_rule = NULL;
> +            free(pr_op);
> +        }
>          free(bond->hash);
>          bond->hash = NULL;
>      }
> --

The patch seems mostly about fixing post recirculation rule leak.
However, there are some
logic duplications, and it seems we can reuse the existing logics
update_recirc_rules().
How about the following patch instead:

https://mail.openvswitch.org/pipermail/ovs-dev/2017-February/329074.html
Huanle Han Feb. 24, 2017, 5:04 a.m. UTC | #2
Thanks, Andy. Your fix looks better than mine.
And, don't forget to apply the fix in ofproto_dpif_delete_internal_flow to
fix the post recirc rule leak. :)

2017年2月24日星期五,Andy Zhou <azhou@ovn.org> 写道:
> On Sun, Feb 19, 2017 at 2:41 AM,  <hanxueluo@126.com> wrote:
>> From: Huanle Han <hanxueluo@gmail.com>
>>
>> The recirc rule_op of tcp-balance bond is still referenced
>> though its memory is freed. And the freed object will be
>> written invalidly on following bond_update_post_recirc_rules.
>> The commit releases the reference to the rule_ops when
>> tcp-balance bond destroy or balance type change.
>>
> Thanks for the bug report and for working on it!
>
>> Thread 23 handler69:
>> Invalid write of size 8
>>    update_recirc_rules (bond.c:385)
>>    bond_update_post_recirc_rules__ (bond.c:952)
>>    bond_update_post_recirc_rules (bond.c:960)
>>    output_normal (ofproto-dpif-xlate.c:2102)
>>    xlate_normal (ofproto-dpif-xlate.c:2858)
>>    xlate_output_action (ofproto-dpif-xlate.c:4407)
>>    do_xlate_actions (ofproto-dpif-xlate.c:5335)
>>    xlate_actions (ofproto-dpif-xlate.c:6198)
>>    upcall_xlate (ofproto-dpif-upcall.c:1129)
>>    process_upcall (ofproto-dpif-upcall.c:1271)
>>    recv_upcalls (ofproto-dpif-upcall.c:822)
>>    udpif_upcall_handler (ofproto-dpif-upcall.c:740)
>>  Address 0x18630490 is 1,904 bytes inside a block of size 12,288 free'd
>>     free (vg_replace_malloc.c:529)
>>    bond_entry_reset (bond.c:1635)
>>    bond_reconfigure (bond.c:457)
>>    bundle_set (ofproto-dpif.c:2896)
>>    ofproto_bundle_register (ofproto.c:1343)
>>    port_configure (bridge.c:1159)
>>    bridge_reconfigure (bridge.c:785)
>>    bridge_run (bridge.c:3099)
>>    main (ovs-vswitchd.c:111)
>>  Block was alloc'd at
>>     malloc (vg_replace_malloc.c:298)
>>    xmalloc (util.c:110)
>>    bond_entry_reset (bond.c:1629)
>>    bond_reconfigure (bond.c:457)
>>    bond_create (bond.c:245)
>>    bundle_set (ofproto-dpif.c:2900)
>>    ofproto_bundle_register (ofproto.c:1343)
>>    port_configure (bridge.c:1159)
>>    bridge_reconfigure (bridge.c:785)
>>    bridge_run (bridge.c:3099)
>>    main (ovs-vswitchd.c:111)
>
> It is not clear to me how the fix proposed by the patch explains the
> cause of the backtrace.
>
> To me, the backtrace suggests a race condition between revalidate
> thread and the main thread.
> I think I found the race condition, and proposed a fix at:
>
> https://mail.openvswitch.org/pipermail/ovs-dev/2017-February/329073.html
>
>>
>> Signed-off-by: Huanle Han <hanxueluo@gmail.com>
>> ---
>>  ofproto/bond.c | 38 +++++++++++++++++++++++++++++++++-----
>>  1 file changed, 33 insertions(+), 5 deletions(-)
>>
>> diff --git a/ofproto/bond.c b/ofproto/bond.c
>> index 260023e..1c5ae43 100644
>> --- a/ofproto/bond.c
>> +++ b/ofproto/bond.c
>> @@ -275,6 +275,22 @@ bond_unref(struct bond *bond)
>>      hmap_remove(all_bonds, &bond->hmap_node);
>>      ovs_rwlock_unlock(&rwlock);
>>
>> +    HMAP_FOR_EACH_POP (pr_op, hmap_node, &bond->pr_rule_ops) {
>> +        int error = ofproto_dpif_delete_internal_flow(bond->ofproto,
>> +                &pr_op->match, RECIRC_RULE_PRIORITY);
>> +        if (error) {
>> +            char *err_s = match_to_string(&pr_op->match,
>> +                    RECIRC_RULE_PRIORITY);
>> +
>> +            VLOG_ERR("failed to remove post recirculation flow %s when
bond_unref", err_s);
>> +            free(err_s);
>> +        }
>> +
>> +        *pr_op->pr_rule = NULL;
>> +        free(pr_op);
>> +    }
>> +    hmap_destroy(&bond->pr_rule_ops);
>> +
>>      HMAP_FOR_EACH_POP (slave, hmap_node, &bond->slaves) {
>>          /* Client owns 'slave->netdev'. */
>>          free(slave->name);
>> @@ -284,13 +300,9 @@ bond_unref(struct bond *bond)
>>
>>      ovs_mutex_destroy(&bond->mutex);
>>      free(bond->hash);
>> +    bond->hash = NULL;
>>      free(bond->name);
>>
>> -    HMAP_FOR_EACH_POP (pr_op, hmap_node, &bond->pr_rule_ops) {
>> -        free(pr_op);
>> -    }
>> -    hmap_destroy(&bond->pr_rule_ops);
>> -
>>      if (bond->recirc_id) {
>>          recirc_free_id(bond->recirc_id);
>>      }
>> @@ -1635,6 +1647,22 @@ bond_entry_reset(struct bond *bond)
>>
>>          bond->next_rebalance = time_msec() + bond->rebalance_interval;
>>      } else {
>> +        struct bond_pr_rule_op *pr_op;
>> +        HMAP_FOR_EACH_POP (pr_op, hmap_node, &bond->pr_rule_ops) {
>> +            int error = ofproto_dpif_delete_internal_flow(bond->ofproto,
>> +                    &pr_op->match, RECIRC_RULE_PRIORITY);
>> +            if (error) {
>> +                char *err_s = match_to_string(&pr_op->match,
>> +                        RECIRC_RULE_PRIORITY);
>> +
>> +                VLOG_ERR("failed to remove post recirculation flow %s
when bond reset", err_s);
>> +                free(err_s);
>> +            }
>> +
>> +            hmap_remove(&bond->pr_rule_ops, &pr_op->hmap_node);
>> +            *pr_op->pr_rule = NULL;
>> +            free(pr_op);
>> +        }
>>          free(bond->hash);
>>          bond->hash = NULL;
>>      }
>> --
>
> The patch seems mostly about fixing post recirculation rule leak.
> However, there are some
> logic duplications, and it seems we can reuse the existing logics
> update_recirc_rules().
> How about the following patch instead:
>
> https://mail.openvswitch.org/pipermail/ovs-dev/2017-February/329074.html
>
Andy Zhou Feb. 24, 2017, 10:42 p.m. UTC | #3
On Thu, Feb 23, 2017 at 9:04 PM, Huanle Han <hanxueluo@gmail.com> wrote:
> Thanks, Andy. Your fix looks better than mine.

I took this as ACK, and added your name to Acked-by: Thanks for the review.

> And, don't forget to apply the fix in ofproto_dpif_delete_internal_flow to
> fix the post recirc rule leak. :)

Done.
diff mbox

Patch

diff --git a/ofproto/bond.c b/ofproto/bond.c
index 260023e..1c5ae43 100644
--- a/ofproto/bond.c
+++ b/ofproto/bond.c
@@ -275,6 +275,22 @@  bond_unref(struct bond *bond)
     hmap_remove(all_bonds, &bond->hmap_node);
     ovs_rwlock_unlock(&rwlock);
 
+    HMAP_FOR_EACH_POP (pr_op, hmap_node, &bond->pr_rule_ops) {
+        int error = ofproto_dpif_delete_internal_flow(bond->ofproto,
+                &pr_op->match, RECIRC_RULE_PRIORITY);
+        if (error) {
+            char *err_s = match_to_string(&pr_op->match,
+                    RECIRC_RULE_PRIORITY);
+
+            VLOG_ERR("failed to remove post recirculation flow %s when bond_unref", err_s);
+            free(err_s);
+        }
+
+        *pr_op->pr_rule = NULL;
+        free(pr_op);
+    }
+    hmap_destroy(&bond->pr_rule_ops);
+
     HMAP_FOR_EACH_POP (slave, hmap_node, &bond->slaves) {
         /* Client owns 'slave->netdev'. */
         free(slave->name);
@@ -284,13 +300,9 @@  bond_unref(struct bond *bond)
 
     ovs_mutex_destroy(&bond->mutex);
     free(bond->hash);
+    bond->hash = NULL;
     free(bond->name);
 
-    HMAP_FOR_EACH_POP (pr_op, hmap_node, &bond->pr_rule_ops) {
-        free(pr_op);
-    }
-    hmap_destroy(&bond->pr_rule_ops);
-
     if (bond->recirc_id) {
         recirc_free_id(bond->recirc_id);
     }
@@ -1635,6 +1647,22 @@  bond_entry_reset(struct bond *bond)
 
         bond->next_rebalance = time_msec() + bond->rebalance_interval;
     } else {
+        struct bond_pr_rule_op *pr_op;
+        HMAP_FOR_EACH_POP (pr_op, hmap_node, &bond->pr_rule_ops) {
+            int error = ofproto_dpif_delete_internal_flow(bond->ofproto,
+                    &pr_op->match, RECIRC_RULE_PRIORITY);
+            if (error) {
+                char *err_s = match_to_string(&pr_op->match,
+                        RECIRC_RULE_PRIORITY);
+
+                VLOG_ERR("failed to remove post recirculation flow %s when bond reset", err_s);
+                free(err_s);
+            }
+
+            hmap_remove(&bond->pr_rule_ops, &pr_op->hmap_node);
+            *pr_op->pr_rule = NULL;
+            free(pr_op);
+        }
         free(bond->hash);
         bond->hash = NULL;
     }