diff mbox

[net-next,v5,3/8] switchdev: allow caller to explicitly request attr_set as deferred

Message ID 1444844455-12508-4-git-send-email-jiri@resnulli.us
State Accepted, archived
Delegated to: David Miller
Headers show

Commit Message

Jiri Pirko Oct. 14, 2015, 5:40 p.m. UTC
From: Jiri Pirko <jiri@mellanox.com>

Caller should know if he can call attr_set directly (when holding RTNL)
or if he has to defer the att_set processing for later.

This also allows drivers to sleep inside attr_set and report operation
status back to switchdev core. Switchdev core then warns if status is
not ok, instead of silent errors happening in drivers.

Benefit from newly introduced switchdev deferred ops infrastructure.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
 include/net/switchdev.h   |   1 +
 net/bridge/br_stp.c       |   3 +-
 net/switchdev/switchdev.c | 108 ++++++++++++++++++----------------------------
 3 files changed, 46 insertions(+), 66 deletions(-)

Comments

Scott Feldman Oct. 15, 2015, 4:34 a.m. UTC | #1
On Wed, Oct 14, 2015 at 10:40 AM, Jiri Pirko <jiri@resnulli.us> wrote:
> From: Jiri Pirko <jiri@mellanox.com>
>
> Caller should know if he can call attr_set directly (when holding RTNL)
> or if he has to defer the att_set processing for later.
>
> This also allows drivers to sleep inside attr_set and report operation
> status back to switchdev core. Switchdev core then warns if status is
> not ok, instead of silent errors happening in drivers.
>
> Benefit from newly introduced switchdev deferred ops infrastructure.
>
> Signed-off-by: Jiri Pirko <jiri@mellanox.com>
> ---
>  include/net/switchdev.h   |   1 +
>  net/bridge/br_stp.c       |   3 +-
>  net/switchdev/switchdev.c | 108 ++++++++++++++++++----------------------------
>  3 files changed, 46 insertions(+), 66 deletions(-)
>
> diff --git a/include/net/switchdev.h b/include/net/switchdev.h
> index d1c7f90..f7de6f8 100644
> --- a/include/net/switchdev.h
> +++ b/include/net/switchdev.h
> @@ -17,6 +17,7 @@
>
>  #define SWITCHDEV_F_NO_RECURSE         BIT(0)
>  #define SWITCHDEV_F_SKIP_EOPNOTSUPP    BIT(1)
> +#define SWITCHDEV_F_DEFER              BIT(2)
>
>  struct switchdev_trans_item {
>         struct list_head list;
> diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c
> index db6d243de..80c34d7 100644
> --- a/net/bridge/br_stp.c
> +++ b/net/bridge/br_stp.c
> @@ -41,13 +41,14 @@ void br_set_state(struct net_bridge_port *p, unsigned int state)
>  {
>         struct switchdev_attr attr = {
>                 .id = SWITCHDEV_ATTR_ID_PORT_STP_STATE,
> +               .flags = SWITCHDEV_F_DEFER,
>                 .u.stp_state = state,
>         };
>         int err;
>
>         p->state = state;
>         err = switchdev_port_attr_set(p->dev, &attr);
> -       if (err && err != -EOPNOTSUPP)
> +       if (err)
>                 br_warn(p->br, "error setting offload STP state on port %u(%s)\n",
>                                 (unsigned int) p->port_no, p->dev->name);
>  }

Should this part of the patch be moved to patch 6/8 where
switchdev_deferred_process() is called from del_nbp()?
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jiri Pirko Oct. 15, 2015, 5:57 a.m. UTC | #2
Thu, Oct 15, 2015 at 06:34:01AM CEST, sfeldma@gmail.com wrote:
>On Wed, Oct 14, 2015 at 10:40 AM, Jiri Pirko <jiri@resnulli.us> wrote:
>> From: Jiri Pirko <jiri@mellanox.com>
>>
>> Caller should know if he can call attr_set directly (when holding RTNL)
>> or if he has to defer the att_set processing for later.
>>
>> This also allows drivers to sleep inside attr_set and report operation
>> status back to switchdev core. Switchdev core then warns if status is
>> not ok, instead of silent errors happening in drivers.
>>
>> Benefit from newly introduced switchdev deferred ops infrastructure.
>>
>> Signed-off-by: Jiri Pirko <jiri@mellanox.com>
>> ---
>>  include/net/switchdev.h   |   1 +
>>  net/bridge/br_stp.c       |   3 +-
>>  net/switchdev/switchdev.c | 108 ++++++++++++++++++----------------------------
>>  3 files changed, 46 insertions(+), 66 deletions(-)
>>
>> diff --git a/include/net/switchdev.h b/include/net/switchdev.h
>> index d1c7f90..f7de6f8 100644
>> --- a/include/net/switchdev.h
>> +++ b/include/net/switchdev.h
>> @@ -17,6 +17,7 @@
>>
>>  #define SWITCHDEV_F_NO_RECURSE         BIT(0)
>>  #define SWITCHDEV_F_SKIP_EOPNOTSUPP    BIT(1)
>> +#define SWITCHDEV_F_DEFER              BIT(2)
>>
>>  struct switchdev_trans_item {
>>         struct list_head list;
>> diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c
>> index db6d243de..80c34d7 100644
>> --- a/net/bridge/br_stp.c
>> +++ b/net/bridge/br_stp.c
>> @@ -41,13 +41,14 @@ void br_set_state(struct net_bridge_port *p, unsigned int state)
>>  {
>>         struct switchdev_attr attr = {
>>                 .id = SWITCHDEV_ATTR_ID_PORT_STP_STATE,
>> +               .flags = SWITCHDEV_F_DEFER,
>>                 .u.stp_state = state,
>>         };
>>         int err;
>>
>>         p->state = state;
>>         err = switchdev_port_attr_set(p->dev, &attr);
>> -       if (err && err != -EOPNOTSUPP)
>> +       if (err)
>>                 br_warn(p->br, "error setting offload STP state on port %u(%s)\n",
>>                                 (unsigned int) p->port_no, p->dev->name);
>>  }
>
>Should this part of the patch be moved to patch 6/8 where
>switchdev_deferred_process() is called from del_nbp()?

No, this part relates to the fact that attr_set now does not defer
automagically. So caller must say when to defer. So having this here is
correct.

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
John Fastabend Oct. 15, 2015, 3:21 p.m. UTC | #3
On 15-10-14 10:40 AM, Jiri Pirko wrote:
> From: Jiri Pirko <jiri@mellanox.com>
> 
> Caller should know if he can call attr_set directly (when holding RTNL)
> or if he has to defer the att_set processing for later.
> 
> This also allows drivers to sleep inside attr_set and report operation
> status back to switchdev core. Switchdev core then warns if status is
> not ok, instead of silent errors happening in drivers.
> 
> Benefit from newly introduced switchdev deferred ops infrastructure.
> 

A nit but the patch description should note your setting the defer bit
on the bridge set state.

> Signed-off-by: Jiri Pirko <jiri@mellanox.com>
> ---
>  include/net/switchdev.h   |   1 +
>  net/bridge/br_stp.c       |   3 +-
>  net/switchdev/switchdev.c | 108 ++++++++++++++++++----------------------------
>  3 files changed, 46 insertions(+), 66 deletions(-)
> 
> diff --git a/include/net/switchdev.h b/include/net/switchdev.h
> index d1c7f90..f7de6f8 100644
> --- a/include/net/switchdev.h
> +++ b/include/net/switchdev.h
> @@ -17,6 +17,7 @@
>  
>  #define SWITCHDEV_F_NO_RECURSE		BIT(0)
>  #define SWITCHDEV_F_SKIP_EOPNOTSUPP	BIT(1)
> +#define SWITCHDEV_F_DEFER		BIT(2)
>  
>  struct switchdev_trans_item {
>  	struct list_head list;
> diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c
> index db6d243de..80c34d7 100644
> --- a/net/bridge/br_stp.c
> +++ b/net/bridge/br_stp.c
> @@ -41,13 +41,14 @@ void br_set_state(struct net_bridge_port *p, unsigned int state)
>  {
>  	struct switchdev_attr attr = {
>  		.id = SWITCHDEV_ATTR_ID_PORT_STP_STATE,
> +		.flags = SWITCHDEV_F_DEFER,
>  		.u.stp_state = state,
>  	};


This creates a possible race (with 6/8) I think, please check!

In del_nbp() we call br_stp_disable_port() to set the port state
to BR_STATE_DISABLE and disabling learning events. But with this
patch it can be deferred. Also note the STP agent may be in userspace
which actually seems more likely the case because you likely want to
run some more modern variant of STP than the kernel supports.

So at some point in the future the driver will turn off learning. At
the same time we call br_fdb_delete_by_port which calls a deferred
set of fdb deletes.

I don't see how you guarantee learning is off before you start doing
the deletes here and possibly learning new addresses after the software
side believes the port is down.

So

   br_stp_disable_port
                           br_fdb_delete_by_port
                           {fdb_del_external_learn}
   [hw learns a fdb]
   [hw disables learning]

What stops this from happening?

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jiri Pirko Oct. 16, 2015, 8:23 a.m. UTC | #4
Thu, Oct 15, 2015 at 05:21:22PM CEST, john.fastabend@gmail.com wrote:
>On 15-10-14 10:40 AM, Jiri Pirko wrote:
>> From: Jiri Pirko <jiri@mellanox.com>
>> 
>> Caller should know if he can call attr_set directly (when holding RTNL)
>> or if he has to defer the att_set processing for later.
>> 
>> This also allows drivers to sleep inside attr_set and report operation
>> status back to switchdev core. Switchdev core then warns if status is
>> not ok, instead of silent errors happening in drivers.
>> 
>> Benefit from newly introduced switchdev deferred ops infrastructure.
>> 
>
>A nit but the patch description should note your setting the defer bit
>on the bridge set state.
>
>> Signed-off-by: Jiri Pirko <jiri@mellanox.com>
>> ---
>>  include/net/switchdev.h   |   1 +
>>  net/bridge/br_stp.c       |   3 +-
>>  net/switchdev/switchdev.c | 108 ++++++++++++++++++----------------------------
>>  3 files changed, 46 insertions(+), 66 deletions(-)
>> 
>> diff --git a/include/net/switchdev.h b/include/net/switchdev.h
>> index d1c7f90..f7de6f8 100644
>> --- a/include/net/switchdev.h
>> +++ b/include/net/switchdev.h
>> @@ -17,6 +17,7 @@
>>  
>>  #define SWITCHDEV_F_NO_RECURSE		BIT(0)
>>  #define SWITCHDEV_F_SKIP_EOPNOTSUPP	BIT(1)
>> +#define SWITCHDEV_F_DEFER		BIT(2)
>>  
>>  struct switchdev_trans_item {
>>  	struct list_head list;
>> diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c
>> index db6d243de..80c34d7 100644
>> --- a/net/bridge/br_stp.c
>> +++ b/net/bridge/br_stp.c
>> @@ -41,13 +41,14 @@ void br_set_state(struct net_bridge_port *p, unsigned int state)
>>  {
>>  	struct switchdev_attr attr = {
>>  		.id = SWITCHDEV_ATTR_ID_PORT_STP_STATE,
>> +		.flags = SWITCHDEV_F_DEFER,
>>  		.u.stp_state = state,
>>  	};
>
>
>This creates a possible race (with 6/8) I think, please check!

Wait. This patch does not change the previous behaviour. Patch 6 does,
so I don't understand why you are asking here. Confusing.


>
>In del_nbp() we call br_stp_disable_port() to set the port state
>to BR_STATE_DISABLE and disabling learning events. But with this
>patch it can be deferred. Also note the STP agent may be in userspace
>which actually seems more likely the case because you likely want to
>run some more modern variant of STP than the kernel supports.
>
>So at some point in the future the driver will turn off learning. At
>the same time we call br_fdb_delete_by_port which calls a deferred
>set of fdb deletes.
>
>I don't see how you guarantee learning is off before you start doing
>the deletes here and possibly learning new addresses after the software
>side believes the port is down.
>
>So
>
>   br_stp_disable_port
>                           br_fdb_delete_by_port
>                           {fdb_del_external_learn}
>   [hw learns a fdb]
>   [hw disables learning]
>
>What stops this from happening?

Okay. This behaviour is the same as without the patchset. What would
resolve the issue it to put switchdev_deferred_process() after
br_stp_disable_port() and before br_fdb_delete_by_port() call.
That would enforce stp change to happen in hw before fdbs are explicitly
deleted. Sound good to you?



>
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Scott Feldman Oct. 16, 2015, 4:20 p.m. UTC | #5
On Fri, Oct 16, 2015 at 1:23 AM, Jiri Pirko <jiri@resnulli.us> wrote:
> Thu, Oct 15, 2015 at 05:21:22PM CEST, john.fastabend@gmail.com wrote:
>>On 15-10-14 10:40 AM, Jiri Pirko wrote:
>>> From: Jiri Pirko <jiri@mellanox.com>
>>>
>>> Caller should know if he can call attr_set directly (when holding RTNL)
>>> or if he has to defer the att_set processing for later.
>>>
>>> This also allows drivers to sleep inside attr_set and report operation
>>> status back to switchdev core. Switchdev core then warns if status is
>>> not ok, instead of silent errors happening in drivers.
>>>
>>> Benefit from newly introduced switchdev deferred ops infrastructure.
>>>
>>
>>A nit but the patch description should note your setting the defer bit
>>on the bridge set state.
>>
>>> Signed-off-by: Jiri Pirko <jiri@mellanox.com>
>>> ---
>>>  include/net/switchdev.h   |   1 +
>>>  net/bridge/br_stp.c       |   3 +-
>>>  net/switchdev/switchdev.c | 108 ++++++++++++++++++----------------------------
>>>  3 files changed, 46 insertions(+), 66 deletions(-)
>>>
>>> diff --git a/include/net/switchdev.h b/include/net/switchdev.h
>>> index d1c7f90..f7de6f8 100644
>>> --- a/include/net/switchdev.h
>>> +++ b/include/net/switchdev.h
>>> @@ -17,6 +17,7 @@
>>>
>>>  #define SWITCHDEV_F_NO_RECURSE              BIT(0)
>>>  #define SWITCHDEV_F_SKIP_EOPNOTSUPP BIT(1)
>>> +#define SWITCHDEV_F_DEFER           BIT(2)
>>>
>>>  struct switchdev_trans_item {
>>>      struct list_head list;
>>> diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c
>>> index db6d243de..80c34d7 100644
>>> --- a/net/bridge/br_stp.c
>>> +++ b/net/bridge/br_stp.c
>>> @@ -41,13 +41,14 @@ void br_set_state(struct net_bridge_port *p, unsigned int state)
>>>  {
>>>      struct switchdev_attr attr = {
>>>              .id = SWITCHDEV_ATTR_ID_PORT_STP_STATE,
>>> +            .flags = SWITCHDEV_F_DEFER,
>>>              .u.stp_state = state,
>>>      };
>>
>>
>>This creates a possible race (with 6/8) I think, please check!
>
> Wait. This patch does not change the previous behaviour. Patch 6 does,
> so I don't understand why you are asking here. Confusing.
>
>
>>
>>In del_nbp() we call br_stp_disable_port() to set the port state
>>to BR_STATE_DISABLE and disabling learning events. But with this
>>patch it can be deferred. Also note the STP agent may be in userspace
>>which actually seems more likely the case because you likely want to
>>run some more modern variant of STP than the kernel supports.
>>
>>So at some point in the future the driver will turn off learning. At
>>the same time we call br_fdb_delete_by_port which calls a deferred
>>set of fdb deletes.
>>
>>I don't see how you guarantee learning is off before you start doing
>>the deletes here and possibly learning new addresses after the software
>>side believes the port is down.
>>
>>So
>>
>>   br_stp_disable_port
>>                           br_fdb_delete_by_port
>>                           {fdb_del_external_learn}
>>   [hw learns a fdb]
>>   [hw disables learning]
>>
>>What stops this from happening?
>
> Okay. This behaviour is the same as without the patchset. What would
> resolve the issue it to put switchdev_deferred_process() after
> br_stp_disable_port() and before br_fdb_delete_by_port() call.
> That would enforce stp change to happen in hw before fdbs are explicitly
> deleted. Sound good to you?

Doesn't HW already see things in the right order since items are
dequeued from the deferred list in the order queued?
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
John Fastabend Oct. 16, 2015, 5:16 p.m. UTC | #6
On 15-10-16 09:20 AM, Scott Feldman wrote:
> On Fri, Oct 16, 2015 at 1:23 AM, Jiri Pirko <jiri@resnulli.us> wrote:
>> Thu, Oct 15, 2015 at 05:21:22PM CEST, john.fastabend@gmail.com wrote:
>>> On 15-10-14 10:40 AM, Jiri Pirko wrote:
>>>> From: Jiri Pirko <jiri@mellanox.com>
>>>>
>>>> Caller should know if he can call attr_set directly (when holding RTNL)
>>>> or if he has to defer the att_set processing for later.
>>>>
>>>> This also allows drivers to sleep inside attr_set and report operation
>>>> status back to switchdev core. Switchdev core then warns if status is
>>>> not ok, instead of silent errors happening in drivers.
>>>>
>>>> Benefit from newly introduced switchdev deferred ops infrastructure.
>>>>
>>>
>>> A nit but the patch description should note your setting the defer bit
>>> on the bridge set state.
>>>
>>>> Signed-off-by: Jiri Pirko <jiri@mellanox.com>
>>>> ---
>>>>  include/net/switchdev.h   |   1 +
>>>>  net/bridge/br_stp.c       |   3 +-
>>>>  net/switchdev/switchdev.c | 108 ++++++++++++++++++----------------------------
>>>>  3 files changed, 46 insertions(+), 66 deletions(-)
>>>>
>>>> diff --git a/include/net/switchdev.h b/include/net/switchdev.h
>>>> index d1c7f90..f7de6f8 100644
>>>> --- a/include/net/switchdev.h
>>>> +++ b/include/net/switchdev.h
>>>> @@ -17,6 +17,7 @@
>>>>
>>>>  #define SWITCHDEV_F_NO_RECURSE              BIT(0)
>>>>  #define SWITCHDEV_F_SKIP_EOPNOTSUPP BIT(1)
>>>> +#define SWITCHDEV_F_DEFER           BIT(2)
>>>>
>>>>  struct switchdev_trans_item {
>>>>      struct list_head list;
>>>> diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c
>>>> index db6d243de..80c34d7 100644
>>>> --- a/net/bridge/br_stp.c
>>>> +++ b/net/bridge/br_stp.c
>>>> @@ -41,13 +41,14 @@ void br_set_state(struct net_bridge_port *p, unsigned int state)
>>>>  {
>>>>      struct switchdev_attr attr = {
>>>>              .id = SWITCHDEV_ATTR_ID_PORT_STP_STATE,
>>>> +            .flags = SWITCHDEV_F_DEFER,
>>>>              .u.stp_state = state,
>>>>      };
>>>
>>>
>>> This creates a possible race (with 6/8) I think, please check!
>>
>> Wait. This patch does not change the previous behaviour. Patch 6 does,
>> so I don't understand why you are asking here. Confusing.
>>
>>
>>>
>>> In del_nbp() we call br_stp_disable_port() to set the port state
>>> to BR_STATE_DISABLE and disabling learning events. But with this
>>> patch it can be deferred. Also note the STP agent may be in userspace
>>> which actually seems more likely the case because you likely want to
>>> run some more modern variant of STP than the kernel supports.
>>>
>>> So at some point in the future the driver will turn off learning. At
>>> the same time we call br_fdb_delete_by_port which calls a deferred
>>> set of fdb deletes.
>>>
>>> I don't see how you guarantee learning is off before you start doing
>>> the deletes here and possibly learning new addresses after the software
>>> side believes the port is down.
>>>
>>> So
>>>
>>>   br_stp_disable_port
>>>                           br_fdb_delete_by_port
>>>                           {fdb_del_external_learn}
>>>   [hw learns a fdb]
>>>   [hw disables learning]
>>>
>>> What stops this from happening?
>>
>> Okay. This behaviour is the same as without the patchset. What would
>> resolve the issue it to put switchdev_deferred_process() after
>> br_stp_disable_port() and before br_fdb_delete_by_port() call.
>> That would enforce stp change to happen in hw before fdbs are explicitly
>> deleted. Sound good to you?
> 
> Doesn't HW already see things in the right order since items are
> dequeued from the deferred list in the order queued?
> 

I just noticed there is the rtnl lock around the deferred process so
this should mean only a single deferred task is run at a time. OK sort
of a big hammer though.

+static void switchdev_deferred_process_work(struct work_struct *work)
+{
+	rtnl_lock();
+	switchdev_deferred_process();
+	rtnl_unlock();
+}

But I don't see how this block of code works,

        spin_lock_bh(&br->lock);
        br_stp_disable_port(p);
        spin_unlock_bh(&br->lock);

        br_ifinfo_notify(RTM_DELLINK, p);

        list_del_rcu(&p->list);

        nbp_vlan_flush(p);
        br_fdb_delete_by_port(br, p, 0, 1);
        switchdev_deferred_process();


If you defer disabling learning then setup a bunch of deletes also
deferred, what guarantees that learning was disabled before you setup
the deferred fdb delete list?

It seems you may call rocker_port_fdb_learn_work() after the
br_fdb_delete_by_port call above but before the deferred learn disable
call which would have disabled it. Also the rocker_port_fdb_learn_work
queue is on a lw->work and doesn't use the same locking?

Its sort of an interesting hieararchy of work queues being built up
here. I'm being a bit overly paranoid at this point because I've seen
some really ugly bugs around these types of things that are difficult
to spot because they are correctness bugs instead of hard crashes.

.John


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
John Fastabend Oct. 17, 2015, 2:11 a.m. UTC | #7
On 15-10-16 01:23 AM, Jiri Pirko wrote:
> Thu, Oct 15, 2015 at 05:21:22PM CEST, john.fastabend@gmail.com wrote:
>> On 15-10-14 10:40 AM, Jiri Pirko wrote:
>>> From: Jiri Pirko <jiri@mellanox.com>
>>>
>>> Caller should know if he can call attr_set directly (when holding RTNL)
>>> or if he has to defer the att_set processing for later.
>>>
>>> This also allows drivers to sleep inside attr_set and report operation
>>> status back to switchdev core. Switchdev core then warns if status is
>>> not ok, instead of silent errors happening in drivers.
>>>
>>> Benefit from newly introduced switchdev deferred ops infrastructure.
>>>
>>
>> A nit but the patch description should note your setting the defer bit
>> on the bridge set state.
>>
>>> Signed-off-by: Jiri Pirko <jiri@mellanox.com>
>>> ---
>>>  include/net/switchdev.h   |   1 +
>>>  net/bridge/br_stp.c       |   3 +-
>>>  net/switchdev/switchdev.c | 108 ++++++++++++++++++----------------------------
>>>  3 files changed, 46 insertions(+), 66 deletions(-)
>>>
>>> diff --git a/include/net/switchdev.h b/include/net/switchdev.h
>>> index d1c7f90..f7de6f8 100644
>>> --- a/include/net/switchdev.h
>>> +++ b/include/net/switchdev.h
>>> @@ -17,6 +17,7 @@
>>>  
>>>  #define SWITCHDEV_F_NO_RECURSE		BIT(0)
>>>  #define SWITCHDEV_F_SKIP_EOPNOTSUPP	BIT(1)
>>> +#define SWITCHDEV_F_DEFER		BIT(2)
>>>  
>>>  struct switchdev_trans_item {
>>>  	struct list_head list;
>>> diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c
>>> index db6d243de..80c34d7 100644
>>> --- a/net/bridge/br_stp.c
>>> +++ b/net/bridge/br_stp.c
>>> @@ -41,13 +41,14 @@ void br_set_state(struct net_bridge_port *p, unsigned int state)
>>>  {
>>>  	struct switchdev_attr attr = {
>>>  		.id = SWITCHDEV_ATTR_ID_PORT_STP_STATE,
>>> +		.flags = SWITCHDEV_F_DEFER,
>>>  		.u.stp_state = state,
>>>  	};
>>
>>
>> This creates a possible race (with 6/8) I think, please check!
> 
> Wait. This patch does not change the previous behaviour. Patch 6 does,
> so I don't understand why you are asking here. Confusing.
> 

Sorry if its confusing I keyed of the addition of the SWITCHDEV_F_DEFER
here.

> 
>>
>> In del_nbp() we call br_stp_disable_port() to set the port state
>> to BR_STATE_DISABLE and disabling learning events. But with this
>> patch it can be deferred. Also note the STP agent may be in userspace
>> which actually seems more likely the case because you likely want to
>> run some more modern variant of STP than the kernel supports.
>>
>> So at some point in the future the driver will turn off learning. At
>> the same time we call br_fdb_delete_by_port which calls a deferred
>> set of fdb deletes.
>>
>> I don't see how you guarantee learning is off before you start doing
>> the deletes here and possibly learning new addresses after the software
>> side believes the port is down.
>>
>> So
>>
>>   br_stp_disable_port
>>                           br_fdb_delete_by_port
>>                           {fdb_del_external_learn}
>>   [hw learns a fdb]
>>   [hw disables learning]
>>
>> What stops this from happening?
> 
> Okay. This behaviour is the same as without the patchset. What would
> resolve the issue it to put switchdev_deferred_process() after
> br_stp_disable_port() and before br_fdb_delete_by_port() call.
> That would enforce stp change to happen in hw before fdbs are explicitly
> deleted. Sound good to you?

OK so putting the switchdev_deferred_process() between the disable_port
and the delete_by_port will enforce the stp change to happen in hw
before the fdbs are explicitly deleted. I think this is minimally
required. I don't like scattering these flush_workqueue() calls all
over the place but I don't have any better ideas right now so sounds
good enough.

But now I'm wondering if you can have a deferred fdb add in the rocker
driver (rocker_port_fdb_learn_work) running in parallel with this that
could happen after the delete and add a bogus fdb entry. I think you
also need to have a flush in rocker_port_stp_update() to handle this
case.

Also I agree these issues were not completely caused by your patches.

Thanks,
John
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/include/net/switchdev.h b/include/net/switchdev.h
index d1c7f90..f7de6f8 100644
--- a/include/net/switchdev.h
+++ b/include/net/switchdev.h
@@ -17,6 +17,7 @@ 
 
 #define SWITCHDEV_F_NO_RECURSE		BIT(0)
 #define SWITCHDEV_F_SKIP_EOPNOTSUPP	BIT(1)
+#define SWITCHDEV_F_DEFER		BIT(2)
 
 struct switchdev_trans_item {
 	struct list_head list;
diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c
index db6d243de..80c34d7 100644
--- a/net/bridge/br_stp.c
+++ b/net/bridge/br_stp.c
@@ -41,13 +41,14 @@  void br_set_state(struct net_bridge_port *p, unsigned int state)
 {
 	struct switchdev_attr attr = {
 		.id = SWITCHDEV_ATTR_ID_PORT_STP_STATE,
+		.flags = SWITCHDEV_F_DEFER,
 		.u.stp_state = state,
 	};
 	int err;
 
 	p->state = state;
 	err = switchdev_port_attr_set(p->dev, &attr);
-	if (err && err != -EOPNOTSUPP)
+	if (err)
 		br_warn(p->br, "error setting offload STP state on port %u(%s)\n",
 				(unsigned int) p->port_no, p->dev->name);
 }
diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c
index 23b4e5b..007b8f4 100644
--- a/net/switchdev/switchdev.c
+++ b/net/switchdev/switchdev.c
@@ -250,75 +250,12 @@  done:
 	return err;
 }
 
-struct switchdev_attr_set_work {
-	struct work_struct work;
-	struct net_device *dev;
-	struct switchdev_attr attr;
-};
-
-static void switchdev_port_attr_set_work(struct work_struct *work)
-{
-	struct switchdev_attr_set_work *asw =
-		container_of(work, struct switchdev_attr_set_work, work);
-	int err;
-
-	rtnl_lock();
-	err = switchdev_port_attr_set(asw->dev, &asw->attr);
-	if (err && err != -EOPNOTSUPP)
-		netdev_err(asw->dev, "failed (err=%d) to set attribute (id=%d)\n",
-			   err, asw->attr.id);
-	rtnl_unlock();
-
-	dev_put(asw->dev);
-	kfree(work);
-}
-
-static int switchdev_port_attr_set_defer(struct net_device *dev,
-					 const struct switchdev_attr *attr)
-{
-	struct switchdev_attr_set_work *asw;
-
-	asw = kmalloc(sizeof(*asw), GFP_ATOMIC);
-	if (!asw)
-		return -ENOMEM;
-
-	INIT_WORK(&asw->work, switchdev_port_attr_set_work);
-
-	dev_hold(dev);
-	asw->dev = dev;
-	memcpy(&asw->attr, attr, sizeof(asw->attr));
-
-	schedule_work(&asw->work);
-
-	return 0;
-}
-
-/**
- *	switchdev_port_attr_set - Set port attribute
- *
- *	@dev: port device
- *	@attr: attribute to set
- *
- *	Use a 2-phase prepare-commit transaction model to ensure
- *	system is not left in a partially updated state due to
- *	failure from driver/device.
- */
-int switchdev_port_attr_set(struct net_device *dev,
-			    const struct switchdev_attr *attr)
+static int switchdev_port_attr_set_now(struct net_device *dev,
+				       const struct switchdev_attr *attr)
 {
 	struct switchdev_trans trans;
 	int err;
 
-	if (!rtnl_is_locked()) {
-		/* Running prepare-commit transaction across stacked
-		 * devices requires nothing moves, so if rtnl_lock is
-		 * not held, schedule a worker thread to hold rtnl_lock
-		 * while setting attr.
-		 */
-
-		return switchdev_port_attr_set_defer(dev, attr);
-	}
-
 	switchdev_trans_init(&trans);
 
 	/* Phase I: prepare for attr set. Driver/device should fail
@@ -355,6 +292,47 @@  int switchdev_port_attr_set(struct net_device *dev,
 
 	return err;
 }
+
+static void switchdev_port_attr_set_deferred(struct net_device *dev,
+					     const void *data)
+{
+	const struct switchdev_attr *attr = data;
+	int err;
+
+	err = switchdev_port_attr_set_now(dev, attr);
+	if (err && err != -EOPNOTSUPP)
+		netdev_err(dev, "failed (err=%d) to set attribute (id=%d)\n",
+			   err, attr->id);
+}
+
+static int switchdev_port_attr_set_defer(struct net_device *dev,
+					 const struct switchdev_attr *attr)
+{
+	return switchdev_deferred_enqueue(dev, attr, sizeof(*attr),
+					  switchdev_port_attr_set_deferred);
+}
+
+/**
+ *	switchdev_port_attr_set - Set port attribute
+ *
+ *	@dev: port device
+ *	@attr: attribute to set
+ *
+ *	Use a 2-phase prepare-commit transaction model to ensure
+ *	system is not left in a partially updated state due to
+ *	failure from driver/device.
+ *
+ *	rtnl_lock must be held and must not be in atomic section,
+ *	in case SWITCHDEV_F_DEFER flag is not set.
+ */
+int switchdev_port_attr_set(struct net_device *dev,
+			    const struct switchdev_attr *attr)
+{
+	if (attr->flags & SWITCHDEV_F_DEFER)
+		return switchdev_port_attr_set_defer(dev, attr);
+	ASSERT_RTNL();
+	return switchdev_port_attr_set_now(dev, attr);
+}
 EXPORT_SYMBOL_GPL(switchdev_port_attr_set);
 
 static int __switchdev_port_obj_add(struct net_device *dev,