diff mbox

[net-next,v4,2/7] switchdev: allow caller to explicitly request attr_set as deferred

Message ID 1444672986-20709-1-git-send-email-jiri@resnulli.us
State Changes Requested, archived
Delegated to: David Miller
Headers show

Commit Message

Jiri Pirko Oct. 12, 2015, 6:03 p.m. UTC
From: Jiri Pirko <jiri@mellanox.com>

Caller should know if he can call attr_set directly (when holding RTNL)
or if he has to defer the att_set processing for later.

This also allows drivers to sleep inside attr_set and report operation
status back to switchdev core. Switchdev core then warns if status is
not ok, instead of silent errors happening in drivers.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
 include/net/switchdev.h   |   1 +
 net/bridge/br_stp.c       |   3 +-
 net/switchdev/switchdev.c | 107 ++++++++++++++++++++++++----------------------
 3 files changed, 59 insertions(+), 52 deletions(-)

Comments

Scott Feldman Oct. 13, 2015, 2:52 a.m. UTC | #1
On Mon, Oct 12, 2015 at 11:03 AM, Jiri Pirko <jiri@resnulli.us> wrote:
> From: Jiri Pirko <jiri@mellanox.com>
>
> Caller should know if he can call attr_set directly (when holding RTNL)
> or if he has to defer the att_set processing for later.
>
> This also allows drivers to sleep inside attr_set and report operation
> status back to switchdev core. Switchdev core then warns if status is
> not ok, instead of silent errors happening in drivers.
>
> Signed-off-by: Jiri Pirko <jiri@mellanox.com>
> ---
>  include/net/switchdev.h   |   1 +
>  net/bridge/br_stp.c       |   3 +-
>  net/switchdev/switchdev.c | 107 ++++++++++++++++++++++++----------------------
>  3 files changed, 59 insertions(+), 52 deletions(-)
>
> diff --git a/include/net/switchdev.h b/include/net/switchdev.h
> index d2879f2..6b109e4 100644
> --- a/include/net/switchdev.h
> +++ b/include/net/switchdev.h
> @@ -17,6 +17,7 @@
>
>  #define SWITCHDEV_F_NO_RECURSE         BIT(0)
>  #define SWITCHDEV_F_SKIP_EOPNOTSUPP    BIT(1)
> +#define SWITCHDEV_F_DEFER              BIT(2)
>
>  struct switchdev_trans_item {
>         struct list_head list;
> diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c
> index db6d243de..80c34d7 100644
> --- a/net/bridge/br_stp.c
> +++ b/net/bridge/br_stp.c
> @@ -41,13 +41,14 @@ void br_set_state(struct net_bridge_port *p, unsigned int state)
>  {
>         struct switchdev_attr attr = {
>                 .id = SWITCHDEV_ATTR_ID_PORT_STP_STATE,
> +               .flags = SWITCHDEV_F_DEFER,
>                 .u.stp_state = state,
>         };
>         int err;
>
>         p->state = state;
>         err = switchdev_port_attr_set(p->dev, &attr);
> -       if (err && err != -EOPNOTSUPP)
> +       if (err)

This looks like a problem as now all other non-switchdev ports will
get an WARN in the log when STP state changes.  We should only WARN if
there was an err and the err is not -EOPNOTSUPP.

>                 br_warn(p->br, "error setting offload STP state on port %u(%s)\n",
>                                 (unsigned int) p->port_no, p->dev->name);
>  }

<snip>

>  struct switchdev_attr_set_work {
>         struct work_struct work;
>         struct net_device *dev;
> @@ -183,14 +226,17 @@ static void switchdev_port_attr_set_work(struct work_struct *work)
>  {
>         struct switchdev_attr_set_work *asw =
>                 container_of(work, struct switchdev_attr_set_work, work);
> +       bool rtnl_locked = rtnl_is_locked();
>         int err;
>
> -       rtnl_lock();
> -       err = switchdev_port_attr_set(asw->dev, &asw->attr);
> +       if (!rtnl_locked)
> +               rtnl_lock();

I'm not following this change.  If someone else has rtnl_lock, we'll
not wait to grab it here ourselves, and proceed as if we have the
lock.  But what if that someone else releases the lock in the middle
of us doing switchdev_port_attr_set_now?  Seems we want to
unconditionally wait and grab the lock.  We need to block anything
from moving while we do the attr set.

> +       err = switchdev_port_attr_set_now(asw->dev, &asw->attr);
>         if (err && err != -EOPNOTSUPP)
>                 netdev_err(asw->dev, "failed (err=%d) to set attribute (id=%d)\n",
>                            err, asw->attr.id);
> -       rtnl_unlock();
> +       if (!rtnl_locked)
> +               rtnl_unlock();
>
>         dev_put(asw->dev);
>         kfree(work);
> @@ -211,7 +257,7 @@ static int switchdev_port_attr_set_defer(struct net_device *dev,
>         asw->dev = dev;
>         memcpy(&asw->attr, attr, sizeof(asw->attr));
>
> -       schedule_work(&asw->work);
> +       queue_work(switchdev_wq, &asw->work);
>
>         return 0;
>  }
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
John Fastabend Oct. 13, 2015, 4:40 a.m. UTC | #2
On 15-10-12 07:52 PM, Scott Feldman wrote:
> On Mon, Oct 12, 2015 at 11:03 AM, Jiri Pirko <jiri@resnulli.us> wrote:
>> From: Jiri Pirko <jiri@mellanox.com>
>>
>> Caller should know if he can call attr_set directly (when holding RTNL)
>> or if he has to defer the att_set processing for later.
>>
>> This also allows drivers to sleep inside attr_set and report operation
>> status back to switchdev core. Switchdev core then warns if status is
>> not ok, instead of silent errors happening in drivers.
>>
>> Signed-off-by: Jiri Pirko <jiri@mellanox.com>
>> ---
>>  include/net/switchdev.h   |   1 +
>>  net/bridge/br_stp.c       |   3 +-
>>  net/switchdev/switchdev.c | 107 ++++++++++++++++++++++++----------------------
>>  3 files changed, 59 insertions(+), 52 deletions(-)
>>
>> diff --git a/include/net/switchdev.h b/include/net/switchdev.h
>> index d2879f2..6b109e4 100644
>> --- a/include/net/switchdev.h
>> +++ b/include/net/switchdev.h
>> @@ -17,6 +17,7 @@
>>
>>  #define SWITCHDEV_F_NO_RECURSE         BIT(0)
>>  #define SWITCHDEV_F_SKIP_EOPNOTSUPP    BIT(1)
>> +#define SWITCHDEV_F_DEFER              BIT(2)
>>
>>  struct switchdev_trans_item {
>>         struct list_head list;
>> diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c
>> index db6d243de..80c34d7 100644
>> --- a/net/bridge/br_stp.c
>> +++ b/net/bridge/br_stp.c
>> @@ -41,13 +41,14 @@ void br_set_state(struct net_bridge_port *p, unsigned int state)
>>  {
>>         struct switchdev_attr attr = {
>>                 .id = SWITCHDEV_ATTR_ID_PORT_STP_STATE,
>> +               .flags = SWITCHDEV_F_DEFER,
>>                 .u.stp_state = state,
>>         };
>>         int err;
>>
>>         p->state = state;
>>         err = switchdev_port_attr_set(p->dev, &attr);
>> -       if (err && err != -EOPNOTSUPP)
>> +       if (err)
> 
> This looks like a problem as now all other non-switchdev ports will
> get an WARN in the log when STP state changes.  We should only WARN if
> there was an err and the err is not -EOPNOTSUPP.
> 
>>                 br_warn(p->br, "error setting offload STP state on port %u(%s)\n",
>>                                 (unsigned int) p->port_no, p->dev->name);
>>  }
> 
> <snip>
> 
>>  struct switchdev_attr_set_work {
>>         struct work_struct work;
>>         struct net_device *dev;
>> @@ -183,14 +226,17 @@ static void switchdev_port_attr_set_work(struct work_struct *work)
>>  {
>>         struct switchdev_attr_set_work *asw =
>>                 container_of(work, struct switchdev_attr_set_work, work);
>> +       bool rtnl_locked = rtnl_is_locked();
>>         int err;
>>
>> -       rtnl_lock();
>> -       err = switchdev_port_attr_set(asw->dev, &asw->attr);
>> +       if (!rtnl_locked)
>> +               rtnl_lock();
> 
> I'm not following this change.  If someone else has rtnl_lock, we'll
> not wait to grab it here ourselves, and proceed as if we have the
> lock.  But what if that someone else releases the lock in the middle
> of us doing switchdev_port_attr_set_now?  Seems we want to
> unconditionally wait and grab the lock.  We need to block anything
> from moving while we do the attr set.
> 

Also an additional race between setting rtnl_locked and the if stmt
and then grabbing the lock. There seems to be a something of pattern
around this where other subsystems use a rtnl_trylock and if it fails
do a restart/re-queue operation to retry. Looks like how you handle
it in the team driver at least.

>> +       err = switchdev_port_attr_set_now(asw->dev, &asw->attr);
>>         if (err && err != -EOPNOTSUPP)
>>                 netdev_err(asw->dev, "failed (err=%d) to set attribute (id=%d)\n",
>>                            err, asw->attr.id);
>> -       rtnl_unlock();
>> +       if (!rtnl_locked)
>> +               rtnl_unlock();
>>
>>         dev_put(asw->dev);
>>         kfree(work);
>> @@ -211,7 +257,7 @@ static int switchdev_port_attr_set_defer(struct net_device *dev,
>>         asw->dev = dev;
>>         memcpy(&asw->attr, attr, sizeof(asw->attr));
>>
>> -       schedule_work(&asw->work);
>> +       queue_work(switchdev_wq, &asw->work);
>>
>>         return 0;
>>  }

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jiri Pirko Oct. 13, 2015, 5:44 a.m. UTC | #3
Tue, Oct 13, 2015 at 04:52:42AM CEST, sfeldma@gmail.com wrote:
>On Mon, Oct 12, 2015 at 11:03 AM, Jiri Pirko <jiri@resnulli.us> wrote:
>> From: Jiri Pirko <jiri@mellanox.com>
>>
>> Caller should know if he can call attr_set directly (when holding RTNL)
>> or if he has to defer the att_set processing for later.
>>
>> This also allows drivers to sleep inside attr_set and report operation
>> status back to switchdev core. Switchdev core then warns if status is
>> not ok, instead of silent errors happening in drivers.
>>
>> Signed-off-by: Jiri Pirko <jiri@mellanox.com>
>> ---
>>  include/net/switchdev.h   |   1 +
>>  net/bridge/br_stp.c       |   3 +-
>>  net/switchdev/switchdev.c | 107 ++++++++++++++++++++++++----------------------
>>  3 files changed, 59 insertions(+), 52 deletions(-)
>>
>> diff --git a/include/net/switchdev.h b/include/net/switchdev.h
>> index d2879f2..6b109e4 100644
>> --- a/include/net/switchdev.h
>> +++ b/include/net/switchdev.h
>> @@ -17,6 +17,7 @@
>>
>>  #define SWITCHDEV_F_NO_RECURSE         BIT(0)
>>  #define SWITCHDEV_F_SKIP_EOPNOTSUPP    BIT(1)
>> +#define SWITCHDEV_F_DEFER              BIT(2)
>>
>>  struct switchdev_trans_item {
>>         struct list_head list;
>> diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c
>> index db6d243de..80c34d7 100644
>> --- a/net/bridge/br_stp.c
>> +++ b/net/bridge/br_stp.c
>> @@ -41,13 +41,14 @@ void br_set_state(struct net_bridge_port *p, unsigned int state)
>>  {
>>         struct switchdev_attr attr = {
>>                 .id = SWITCHDEV_ATTR_ID_PORT_STP_STATE,
>> +               .flags = SWITCHDEV_F_DEFER,
>>                 .u.stp_state = state,
>>         };
>>         int err;
>>
>>         p->state = state;
>>         err = switchdev_port_attr_set(p->dev, &attr);
>> -       if (err && err != -EOPNOTSUPP)
>> +       if (err)
>
>This looks like a problem as now all other non-switchdev ports will
>get an WARN in the log when STP state changes.  We should only WARN if
>there was an err and the err is not -EOPNOTSUPP.

If SWITCHDEV_F_DEFER flag is set, there's only 0 of -ENOMEM.


>
>>                 br_warn(p->br, "error setting offload STP state on port %u(%s)\n",
>>                                 (unsigned int) p->port_no, p->dev->name);
>>  }
>
><snip>
>
>>  struct switchdev_attr_set_work {
>>         struct work_struct work;
>>         struct net_device *dev;
>> @@ -183,14 +226,17 @@ static void switchdev_port_attr_set_work(struct work_struct *work)
>>  {
>>         struct switchdev_attr_set_work *asw =
>>                 container_of(work, struct switchdev_attr_set_work, work);
>> +       bool rtnl_locked = rtnl_is_locked();
>>         int err;
>>
>> -       rtnl_lock();
>> -       err = switchdev_port_attr_set(asw->dev, &asw->attr);
>> +       if (!rtnl_locked)
>> +               rtnl_lock();
>
>I'm not following this change.  If someone else has rtnl_lock, we'll
>not wait to grab it here ourselves, and proceed as if we have the
>lock.  But what if that someone else releases the lock in the middle
>of us doing switchdev_port_attr_set_now?  Seems we want to
>unconditionally wait and grab the lock.  We need to block anything
>from moving while we do the attr set.

Why would someone we call (driver) return the lock? In that case, he is
buggy and should be fixed.

This hunk only ensures we have rtnl_lock. If not, we take it here. We do
not take it unconditionally because we may already have it, for example
if caller of switchdev_flush_deferred holds rtnl_lock.



>
>> +       err = switchdev_port_attr_set_now(asw->dev, &asw->attr);
>>         if (err && err != -EOPNOTSUPP)
>>                 netdev_err(asw->dev, "failed (err=%d) to set attribute (id=%d)\n",
>>                            err, asw->attr.id);
>> -       rtnl_unlock();
>> +       if (!rtnl_locked)
>> +               rtnl_unlock();
>>
>>         dev_put(asw->dev);
>>         kfree(work);
>> @@ -211,7 +257,7 @@ static int switchdev_port_attr_set_defer(struct net_device *dev,
>>         asw->dev = dev;
>>         memcpy(&asw->attr, attr, sizeof(asw->attr));
>>
>> -       schedule_work(&asw->work);
>> +       queue_work(switchdev_wq, &asw->work);
>>
>>         return 0;
>>  }
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jiri Pirko Oct. 13, 2015, 5:45 a.m. UTC | #4
Tue, Oct 13, 2015 at 06:40:25AM CEST, john.fastabend@gmail.com wrote:
>On 15-10-12 07:52 PM, Scott Feldman wrote:
>> On Mon, Oct 12, 2015 at 11:03 AM, Jiri Pirko <jiri@resnulli.us> wrote:
>>> From: Jiri Pirko <jiri@mellanox.com>
>>>
>>> Caller should know if he can call attr_set directly (when holding RTNL)
>>> or if he has to defer the att_set processing for later.
>>>
>>> This also allows drivers to sleep inside attr_set and report operation
>>> status back to switchdev core. Switchdev core then warns if status is
>>> not ok, instead of silent errors happening in drivers.
>>>
>>> Signed-off-by: Jiri Pirko <jiri@mellanox.com>
>>> ---
>>>  include/net/switchdev.h   |   1 +
>>>  net/bridge/br_stp.c       |   3 +-
>>>  net/switchdev/switchdev.c | 107 ++++++++++++++++++++++++----------------------
>>>  3 files changed, 59 insertions(+), 52 deletions(-)
>>>
>>> diff --git a/include/net/switchdev.h b/include/net/switchdev.h
>>> index d2879f2..6b109e4 100644
>>> --- a/include/net/switchdev.h
>>> +++ b/include/net/switchdev.h
>>> @@ -17,6 +17,7 @@
>>>
>>>  #define SWITCHDEV_F_NO_RECURSE         BIT(0)
>>>  #define SWITCHDEV_F_SKIP_EOPNOTSUPP    BIT(1)
>>> +#define SWITCHDEV_F_DEFER              BIT(2)
>>>
>>>  struct switchdev_trans_item {
>>>         struct list_head list;
>>> diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c
>>> index db6d243de..80c34d7 100644
>>> --- a/net/bridge/br_stp.c
>>> +++ b/net/bridge/br_stp.c
>>> @@ -41,13 +41,14 @@ void br_set_state(struct net_bridge_port *p, unsigned int state)
>>>  {
>>>         struct switchdev_attr attr = {
>>>                 .id = SWITCHDEV_ATTR_ID_PORT_STP_STATE,
>>> +               .flags = SWITCHDEV_F_DEFER,
>>>                 .u.stp_state = state,
>>>         };
>>>         int err;
>>>
>>>         p->state = state;
>>>         err = switchdev_port_attr_set(p->dev, &attr);
>>> -       if (err && err != -EOPNOTSUPP)
>>> +       if (err)
>> 
>> This looks like a problem as now all other non-switchdev ports will
>> get an WARN in the log when STP state changes.  We should only WARN if
>> there was an err and the err is not -EOPNOTSUPP.
>> 
>>>                 br_warn(p->br, "error setting offload STP state on port %u(%s)\n",
>>>                                 (unsigned int) p->port_no, p->dev->name);
>>>  }
>> 
>> <snip>
>> 
>>>  struct switchdev_attr_set_work {
>>>         struct work_struct work;
>>>         struct net_device *dev;
>>> @@ -183,14 +226,17 @@ static void switchdev_port_attr_set_work(struct work_struct *work)
>>>  {
>>>         struct switchdev_attr_set_work *asw =
>>>                 container_of(work, struct switchdev_attr_set_work, work);
>>> +       bool rtnl_locked = rtnl_is_locked();
>>>         int err;
>>>
>>> -       rtnl_lock();
>>> -       err = switchdev_port_attr_set(asw->dev, &asw->attr);
>>> +       if (!rtnl_locked)
>>> +               rtnl_lock();
>> 
>> I'm not following this change.  If someone else has rtnl_lock, we'll
>> not wait to grab it here ourselves, and proceed as if we have the
>> lock.  But what if that someone else releases the lock in the middle
>> of us doing switchdev_port_attr_set_now?  Seems we want to
>> unconditionally wait and grab the lock.  We need to block anything
>> from moving while we do the attr set.
>> 
>
>Also an additional race between setting rtnl_locked and the if stmt
>and then grabbing the lock. There seems to be a something of pattern
>around this where other subsystems use a rtnl_trylock and if it fails
>do a restart/re-queue operation to retry. Looks like how you handle
>it in the team driver at least.

No, this is for different case. This is for case someone calls
switchdev_flush_defererd holding the rtnl_lock.


>
>>> +       err = switchdev_port_attr_set_now(asw->dev, &asw->attr);
>>>         if (err && err != -EOPNOTSUPP)
>>>                 netdev_err(asw->dev, "failed (err=%d) to set attribute (id=%d)\n",
>>>                            err, asw->attr.id);
>>> -       rtnl_unlock();
>>> +       if (!rtnl_locked)
>>> +               rtnl_unlock();
>>>
>>>         dev_put(asw->dev);
>>>         kfree(work);
>>> @@ -211,7 +257,7 @@ static int switchdev_port_attr_set_defer(struct net_device *dev,
>>>         asw->dev = dev;
>>>         memcpy(&asw->attr, attr, sizeof(asw->attr));
>>>
>>> -       schedule_work(&asw->work);
>>> +       queue_work(switchdev_wq, &asw->work);
>>>
>>>         return 0;
>>>  }
>
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
John Fastabend Oct. 13, 2015, 5:48 a.m. UTC | #5
On 15-10-12 10:45 PM, Jiri Pirko wrote:
> Tue, Oct 13, 2015 at 06:40:25AM CEST, john.fastabend@gmail.com wrote:
>> On 15-10-12 07:52 PM, Scott Feldman wrote:
>>> On Mon, Oct 12, 2015 at 11:03 AM, Jiri Pirko <jiri@resnulli.us> wrote:
>>>> From: Jiri Pirko <jiri@mellanox.com>
>>>>
>>>> Caller should know if he can call attr_set directly (when holding RTNL)
>>>> or if he has to defer the att_set processing for later.
>>>>
>>>> This also allows drivers to sleep inside attr_set and report operation
>>>> status back to switchdev core. Switchdev core then warns if status is
>>>> not ok, instead of silent errors happening in drivers.
>>>>
>>>> Signed-off-by: Jiri Pirko <jiri@mellanox.com>
>>>> ---
>>>>  include/net/switchdev.h   |   1 +
>>>>  net/bridge/br_stp.c       |   3 +-
>>>>  net/switchdev/switchdev.c | 107 ++++++++++++++++++++++++----------------------
>>>>  3 files changed, 59 insertions(+), 52 deletions(-)
>>>>
>>>> diff --git a/include/net/switchdev.h b/include/net/switchdev.h
>>>> index d2879f2..6b109e4 100644
>>>> --- a/include/net/switchdev.h
>>>> +++ b/include/net/switchdev.h
>>>> @@ -17,6 +17,7 @@
>>>>
>>>>  #define SWITCHDEV_F_NO_RECURSE         BIT(0)
>>>>  #define SWITCHDEV_F_SKIP_EOPNOTSUPP    BIT(1)
>>>> +#define SWITCHDEV_F_DEFER              BIT(2)
>>>>
>>>>  struct switchdev_trans_item {
>>>>         struct list_head list;
>>>> diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c
>>>> index db6d243de..80c34d7 100644
>>>> --- a/net/bridge/br_stp.c
>>>> +++ b/net/bridge/br_stp.c
>>>> @@ -41,13 +41,14 @@ void br_set_state(struct net_bridge_port *p, unsigned int state)
>>>>  {
>>>>         struct switchdev_attr attr = {
>>>>                 .id = SWITCHDEV_ATTR_ID_PORT_STP_STATE,
>>>> +               .flags = SWITCHDEV_F_DEFER,
>>>>                 .u.stp_state = state,
>>>>         };
>>>>         int err;
>>>>
>>>>         p->state = state;
>>>>         err = switchdev_port_attr_set(p->dev, &attr);
>>>> -       if (err && err != -EOPNOTSUPP)
>>>> +       if (err)
>>>
>>> This looks like a problem as now all other non-switchdev ports will
>>> get an WARN in the log when STP state changes.  We should only WARN if
>>> there was an err and the err is not -EOPNOTSUPP.
>>>
>>>>                 br_warn(p->br, "error setting offload STP state on port %u(%s)\n",
>>>>                                 (unsigned int) p->port_no, p->dev->name);
>>>>  }
>>>
>>> <snip>
>>>
>>>>  struct switchdev_attr_set_work {
>>>>         struct work_struct work;
>>>>         struct net_device *dev;
>>>> @@ -183,14 +226,17 @@ static void switchdev_port_attr_set_work(struct work_struct *work)
>>>>  {
>>>>         struct switchdev_attr_set_work *asw =
>>>>                 container_of(work, struct switchdev_attr_set_work, work);
>>>> +       bool rtnl_locked = rtnl_is_locked();
>>>>         int err;
>>>>
>>>> -       rtnl_lock();
>>>> -       err = switchdev_port_attr_set(asw->dev, &asw->attr);
>>>> +       if (!rtnl_locked)
>>>> +               rtnl_lock();
>>>
>>> I'm not following this change.  If someone else has rtnl_lock, we'll
>>> not wait to grab it here ourselves, and proceed as if we have the
>>> lock.  But what if that someone else releases the lock in the middle
>>> of us doing switchdev_port_attr_set_now?  Seems we want to
>>> unconditionally wait and grab the lock.  We need to block anything
>>> from moving while we do the attr set.
>>>
>>
>> Also an additional race between setting rtnl_locked and the if stmt
>> and then grabbing the lock. There seems to be a something of pattern
>> around this where other subsystems use a rtnl_trylock and if it fails
>> do a restart/re-queue operation to retry. Looks like how you handle
>> it in the team driver at least.
> 
> No, this is for different case. This is for case someone calls
> switchdev_flush_defererd holding the rtnl_lock.
> 

OK rather than funky if stmt could you just do a rtnl_trylock() and
put a comment explaining the reasoning?
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
John Fastabend Oct. 13, 2015, 6:03 a.m. UTC | #6
On 15-10-12 10:44 PM, Jiri Pirko wrote:
> Tue, Oct 13, 2015 at 04:52:42AM CEST, sfeldma@gmail.com wrote:
>> On Mon, Oct 12, 2015 at 11:03 AM, Jiri Pirko <jiri@resnulli.us> wrote:
>>> From: Jiri Pirko <jiri@mellanox.com>
>>>
>>> Caller should know if he can call attr_set directly (when holding RTNL)
>>> or if he has to defer the att_set processing for later.
>>>
>>> This also allows drivers to sleep inside attr_set and report operation
>>> status back to switchdev core. Switchdev core then warns if status is
>>> not ok, instead of silent errors happening in drivers.
>>>
>>> Signed-off-by: Jiri Pirko <jiri@mellanox.com>
>>> ---
>>>  include/net/switchdev.h   |   1 +
>>>  net/bridge/br_stp.c       |   3 +-
>>>  net/switchdev/switchdev.c | 107 ++++++++++++++++++++++++----------------------
>>>  3 files changed, 59 insertions(+), 52 deletions(-)
>>>
>>> diff --git a/include/net/switchdev.h b/include/net/switchdev.h
>>> index d2879f2..6b109e4 100644
>>> --- a/include/net/switchdev.h
>>> +++ b/include/net/switchdev.h
>>> @@ -17,6 +17,7 @@
>>>
>>>  #define SWITCHDEV_F_NO_RECURSE         BIT(0)
>>>  #define SWITCHDEV_F_SKIP_EOPNOTSUPP    BIT(1)
>>> +#define SWITCHDEV_F_DEFER              BIT(2)
>>>
>>>  struct switchdev_trans_item {
>>>         struct list_head list;
>>> diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c
>>> index db6d243de..80c34d7 100644
>>> --- a/net/bridge/br_stp.c
>>> +++ b/net/bridge/br_stp.c
>>> @@ -41,13 +41,14 @@ void br_set_state(struct net_bridge_port *p, unsigned int state)
>>>  {
>>>         struct switchdev_attr attr = {
>>>                 .id = SWITCHDEV_ATTR_ID_PORT_STP_STATE,
>>> +               .flags = SWITCHDEV_F_DEFER,
>>>                 .u.stp_state = state,
>>>         };
>>>         int err;
>>>
>>>         p->state = state;
>>>         err = switchdev_port_attr_set(p->dev, &attr);
>>> -       if (err && err != -EOPNOTSUPP)
>>> +       if (err)
>>
>> This looks like a problem as now all other non-switchdev ports will
>> get an WARN in the log when STP state changes.  We should only WARN if
>> there was an err and the err is not -EOPNOTSUPP.
> 
> If SWITCHDEV_F_DEFER flag is set, there's only 0 of -ENOMEM.
> 
> 
>>
>>>                 br_warn(p->br, "error setting offload STP state on port %u(%s)\n",
>>>                                 (unsigned int) p->port_no, p->dev->name);
>>>  }
>>
>> <snip>
>>
>>>  struct switchdev_attr_set_work {
>>>         struct work_struct work;
>>>         struct net_device *dev;
>>> @@ -183,14 +226,17 @@ static void switchdev_port_attr_set_work(struct work_struct *work)
>>>  {
>>>         struct switchdev_attr_set_work *asw =
>>>                 container_of(work, struct switchdev_attr_set_work, work);
>>> +       bool rtnl_locked = rtnl_is_locked();
>>>         int err;
>>>
>>> -       rtnl_lock();
>>> -       err = switchdev_port_attr_set(asw->dev, &asw->attr);
>>> +       if (!rtnl_locked)
>>> +               rtnl_lock();
>>
>> I'm not following this change.  If someone else has rtnl_lock, we'll
>> not wait to grab it here ourselves, and proceed as if we have the
>> lock.  But what if that someone else releases the lock in the middle
>> of us doing switchdev_port_attr_set_now?  Seems we want to
>> unconditionally wait and grab the lock.  We need to block anything
>>from moving while we do the attr set.
> 
> Why would someone we call (driver) return the lock? In that case, he is
> buggy and should be fixed.
> 
> This hunk only ensures we have rtnl_lock. If not, we take it here. We do
> not take it unconditionally because we may already have it, for example
> if caller of switchdev_flush_deferred holds rtnl_lock.
> 

This is where you lost me. How do you know another core doesn't happen
to have the lock when you hit this code path? Maybe someone is running
an ethtool command on another core or something.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jiri Pirko Oct. 13, 2015, 6:21 a.m. UTC | #7
Tue, Oct 13, 2015 at 08:03:46AM CEST, john.fastabend@gmail.com wrote:
>On 15-10-12 10:44 PM, Jiri Pirko wrote:
>> Tue, Oct 13, 2015 at 04:52:42AM CEST, sfeldma@gmail.com wrote:
>>> On Mon, Oct 12, 2015 at 11:03 AM, Jiri Pirko <jiri@resnulli.us> wrote:
>>>> From: Jiri Pirko <jiri@mellanox.com>
>>>>
>>>> Caller should know if he can call attr_set directly (when holding RTNL)
>>>> or if he has to defer the att_set processing for later.
>>>>
>>>> This also allows drivers to sleep inside attr_set and report operation
>>>> status back to switchdev core. Switchdev core then warns if status is
>>>> not ok, instead of silent errors happening in drivers.
>>>>
>>>> Signed-off-by: Jiri Pirko <jiri@mellanox.com>
>>>> ---
>>>>  include/net/switchdev.h   |   1 +
>>>>  net/bridge/br_stp.c       |   3 +-
>>>>  net/switchdev/switchdev.c | 107 ++++++++++++++++++++++++----------------------
>>>>  3 files changed, 59 insertions(+), 52 deletions(-)
>>>>
>>>> diff --git a/include/net/switchdev.h b/include/net/switchdev.h
>>>> index d2879f2..6b109e4 100644
>>>> --- a/include/net/switchdev.h
>>>> +++ b/include/net/switchdev.h
>>>> @@ -17,6 +17,7 @@
>>>>
>>>>  #define SWITCHDEV_F_NO_RECURSE         BIT(0)
>>>>  #define SWITCHDEV_F_SKIP_EOPNOTSUPP    BIT(1)
>>>> +#define SWITCHDEV_F_DEFER              BIT(2)
>>>>
>>>>  struct switchdev_trans_item {
>>>>         struct list_head list;
>>>> diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c
>>>> index db6d243de..80c34d7 100644
>>>> --- a/net/bridge/br_stp.c
>>>> +++ b/net/bridge/br_stp.c
>>>> @@ -41,13 +41,14 @@ void br_set_state(struct net_bridge_port *p, unsigned int state)
>>>>  {
>>>>         struct switchdev_attr attr = {
>>>>                 .id = SWITCHDEV_ATTR_ID_PORT_STP_STATE,
>>>> +               .flags = SWITCHDEV_F_DEFER,
>>>>                 .u.stp_state = state,
>>>>         };
>>>>         int err;
>>>>
>>>>         p->state = state;
>>>>         err = switchdev_port_attr_set(p->dev, &attr);
>>>> -       if (err && err != -EOPNOTSUPP)
>>>> +       if (err)
>>>
>>> This looks like a problem as now all other non-switchdev ports will
>>> get an WARN in the log when STP state changes.  We should only WARN if
>>> there was an err and the err is not -EOPNOTSUPP.
>> 
>> If SWITCHDEV_F_DEFER flag is set, there's only 0 of -ENOMEM.
>> 
>> 
>>>
>>>>                 br_warn(p->br, "error setting offload STP state on port %u(%s)\n",
>>>>                                 (unsigned int) p->port_no, p->dev->name);
>>>>  }
>>>
>>> <snip>
>>>
>>>>  struct switchdev_attr_set_work {
>>>>         struct work_struct work;
>>>>         struct net_device *dev;
>>>> @@ -183,14 +226,17 @@ static void switchdev_port_attr_set_work(struct work_struct *work)
>>>>  {
>>>>         struct switchdev_attr_set_work *asw =
>>>>                 container_of(work, struct switchdev_attr_set_work, work);
>>>> +       bool rtnl_locked = rtnl_is_locked();
>>>>         int err;
>>>>
>>>> -       rtnl_lock();
>>>> -       err = switchdev_port_attr_set(asw->dev, &asw->attr);
>>>> +       if (!rtnl_locked)
>>>> +               rtnl_lock();
>>>
>>> I'm not following this change.  If someone else has rtnl_lock, we'll
>>> not wait to grab it here ourselves, and proceed as if we have the
>>> lock.  But what if that someone else releases the lock in the middle
>>> of us doing switchdev_port_attr_set_now?  Seems we want to
>>> unconditionally wait and grab the lock.  We need to block anything
>>>from moving while we do the attr set.
>> 
>> Why would someone we call (driver) return the lock? In that case, he is
>> buggy and should be fixed.
>> 
>> This hunk only ensures we have rtnl_lock. If not, we take it here. We do
>> not take it unconditionally because we may already have it, for example
>> if caller of switchdev_flush_deferred holds rtnl_lock.
>> 
>
>This is where you lost me. How do you know another core doesn't happen
>to have the lock when you hit this code path? Maybe someone is running
>an ethtool command on another core or something.

You are right. The same problem exists currently in switchdev_port_attr_set.

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Scott Feldman Oct. 13, 2015, 6:53 a.m. UTC | #8
On Mon, Oct 12, 2015 at 11:21 PM, Jiri Pirko <jiri@resnulli.us> wrote:
> Tue, Oct 13, 2015 at 08:03:46AM CEST, john.fastabend@gmail.com wrote:
>>On 15-10-12 10:44 PM, Jiri Pirko wrote:
>>> Tue, Oct 13, 2015 at 04:52:42AM CEST, sfeldma@gmail.com wrote:
>>>> On Mon, Oct 12, 2015 at 11:03 AM, Jiri Pirko <jiri@resnulli.us> wrote:
>>>>> From: Jiri Pirko <jiri@mellanox.com>
>>>>>
>>>>> Caller should know if he can call attr_set directly (when holding RTNL)
>>>>> or if he has to defer the att_set processing for later.
>>>>>
>>>>> This also allows drivers to sleep inside attr_set and report operation
>>>>> status back to switchdev core. Switchdev core then warns if status is
>>>>> not ok, instead of silent errors happening in drivers.
>>>>>
>>>>> Signed-off-by: Jiri Pirko <jiri@mellanox.com>
>>>>> ---
>>>>>  include/net/switchdev.h   |   1 +
>>>>>  net/bridge/br_stp.c       |   3 +-
>>>>>  net/switchdev/switchdev.c | 107 ++++++++++++++++++++++++----------------------
>>>>>  3 files changed, 59 insertions(+), 52 deletions(-)
>>>>>
>>>>> diff --git a/include/net/switchdev.h b/include/net/switchdev.h
>>>>> index d2879f2..6b109e4 100644
>>>>> --- a/include/net/switchdev.h
>>>>> +++ b/include/net/switchdev.h
>>>>> @@ -17,6 +17,7 @@
>>>>>
>>>>>  #define SWITCHDEV_F_NO_RECURSE         BIT(0)
>>>>>  #define SWITCHDEV_F_SKIP_EOPNOTSUPP    BIT(1)
>>>>> +#define SWITCHDEV_F_DEFER              BIT(2)
>>>>>
>>>>>  struct switchdev_trans_item {
>>>>>         struct list_head list;
>>>>> diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c
>>>>> index db6d243de..80c34d7 100644
>>>>> --- a/net/bridge/br_stp.c
>>>>> +++ b/net/bridge/br_stp.c
>>>>> @@ -41,13 +41,14 @@ void br_set_state(struct net_bridge_port *p, unsigned int state)
>>>>>  {
>>>>>         struct switchdev_attr attr = {
>>>>>                 .id = SWITCHDEV_ATTR_ID_PORT_STP_STATE,
>>>>> +               .flags = SWITCHDEV_F_DEFER,
>>>>>                 .u.stp_state = state,
>>>>>         };
>>>>>         int err;
>>>>>
>>>>>         p->state = state;
>>>>>         err = switchdev_port_attr_set(p->dev, &attr);
>>>>> -       if (err && err != -EOPNOTSUPP)
>>>>> +       if (err)
>>>>
>>>> This looks like a problem as now all other non-switchdev ports will
>>>> get an WARN in the log when STP state changes.  We should only WARN if
>>>> there was an err and the err is not -EOPNOTSUPP.
>>>
>>> If SWITCHDEV_F_DEFER flag is set, there's only 0 of -ENOMEM.
>>>
>>>
>>>>
>>>>>                 br_warn(p->br, "error setting offload STP state on port %u(%s)\n",
>>>>>                                 (unsigned int) p->port_no, p->dev->name);
>>>>>  }
>>>>
>>>> <snip>
>>>>
>>>>>  struct switchdev_attr_set_work {
>>>>>         struct work_struct work;
>>>>>         struct net_device *dev;
>>>>> @@ -183,14 +226,17 @@ static void switchdev_port_attr_set_work(struct work_struct *work)
>>>>>  {
>>>>>         struct switchdev_attr_set_work *asw =
>>>>>                 container_of(work, struct switchdev_attr_set_work, work);
>>>>> +       bool rtnl_locked = rtnl_is_locked();
>>>>>         int err;
>>>>>
>>>>> -       rtnl_lock();
>>>>> -       err = switchdev_port_attr_set(asw->dev, &asw->attr);
>>>>> +       if (!rtnl_locked)
>>>>> +               rtnl_lock();
>>>>
>>>> I'm not following this change.  If someone else has rtnl_lock, we'll
>>>> not wait to grab it here ourselves, and proceed as if we have the
>>>> lock.  But what if that someone else releases the lock in the middle
>>>> of us doing switchdev_port_attr_set_now?  Seems we want to
>>>> unconditionally wait and grab the lock.  We need to block anything
>>>>from moving while we do the attr set.
>>>
>>> Why would someone we call (driver) return the lock? In that case, he is
>>> buggy and should be fixed.
>>>
>>> This hunk only ensures we have rtnl_lock. If not, we take it here. We do
>>> not take it unconditionally because we may already have it, for example
>>> if caller of switchdev_flush_deferred holds rtnl_lock.
>>>
>>
>>This is where you lost me. How do you know another core doesn't happen
>>to have the lock when you hit this code path? Maybe someone is running
>>an ethtool command on another core or something.
>
> You are right. The same problem exists currently in switchdev_port_attr_set.

You are right as in you'll change this back to unconditional grabbing
of rtnl_lock?  I don't follow how this problem currently exists as
current code does an unconditional grab of rtnl_lock.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jiri Pirko Oct. 13, 2015, 7:30 a.m. UTC | #9
Tue, Oct 13, 2015 at 08:53:45AM CEST, sfeldma@gmail.com wrote:
>On Mon, Oct 12, 2015 at 11:21 PM, Jiri Pirko <jiri@resnulli.us> wrote:
>> Tue, Oct 13, 2015 at 08:03:46AM CEST, john.fastabend@gmail.com wrote:
>>>On 15-10-12 10:44 PM, Jiri Pirko wrote:
>>>> Tue, Oct 13, 2015 at 04:52:42AM CEST, sfeldma@gmail.com wrote:
>>>>> On Mon, Oct 12, 2015 at 11:03 AM, Jiri Pirko <jiri@resnulli.us> wrote:
>>>>>> From: Jiri Pirko <jiri@mellanox.com>
>>>>>>
>>>>>> Caller should know if he can call attr_set directly (when holding RTNL)
>>>>>> or if he has to defer the att_set processing for later.
>>>>>>
>>>>>> This also allows drivers to sleep inside attr_set and report operation
>>>>>> status back to switchdev core. Switchdev core then warns if status is
>>>>>> not ok, instead of silent errors happening in drivers.
>>>>>>
>>>>>> Signed-off-by: Jiri Pirko <jiri@mellanox.com>
>>>>>> ---
>>>>>>  include/net/switchdev.h   |   1 +
>>>>>>  net/bridge/br_stp.c       |   3 +-
>>>>>>  net/switchdev/switchdev.c | 107 ++++++++++++++++++++++++----------------------
>>>>>>  3 files changed, 59 insertions(+), 52 deletions(-)
>>>>>>
>>>>>> diff --git a/include/net/switchdev.h b/include/net/switchdev.h
>>>>>> index d2879f2..6b109e4 100644
>>>>>> --- a/include/net/switchdev.h
>>>>>> +++ b/include/net/switchdev.h
>>>>>> @@ -17,6 +17,7 @@
>>>>>>
>>>>>>  #define SWITCHDEV_F_NO_RECURSE         BIT(0)
>>>>>>  #define SWITCHDEV_F_SKIP_EOPNOTSUPP    BIT(1)
>>>>>> +#define SWITCHDEV_F_DEFER              BIT(2)
>>>>>>
>>>>>>  struct switchdev_trans_item {
>>>>>>         struct list_head list;
>>>>>> diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c
>>>>>> index db6d243de..80c34d7 100644
>>>>>> --- a/net/bridge/br_stp.c
>>>>>> +++ b/net/bridge/br_stp.c
>>>>>> @@ -41,13 +41,14 @@ void br_set_state(struct net_bridge_port *p, unsigned int state)
>>>>>>  {
>>>>>>         struct switchdev_attr attr = {
>>>>>>                 .id = SWITCHDEV_ATTR_ID_PORT_STP_STATE,
>>>>>> +               .flags = SWITCHDEV_F_DEFER,
>>>>>>                 .u.stp_state = state,
>>>>>>         };
>>>>>>         int err;
>>>>>>
>>>>>>         p->state = state;
>>>>>>         err = switchdev_port_attr_set(p->dev, &attr);
>>>>>> -       if (err && err != -EOPNOTSUPP)
>>>>>> +       if (err)
>>>>>
>>>>> This looks like a problem as now all other non-switchdev ports will
>>>>> get an WARN in the log when STP state changes.  We should only WARN if
>>>>> there was an err and the err is not -EOPNOTSUPP.
>>>>
>>>> If SWITCHDEV_F_DEFER flag is set, there's only 0 of -ENOMEM.
>>>>
>>>>
>>>>>
>>>>>>                 br_warn(p->br, "error setting offload STP state on port %u(%s)\n",
>>>>>>                                 (unsigned int) p->port_no, p->dev->name);
>>>>>>  }
>>>>>
>>>>> <snip>
>>>>>
>>>>>>  struct switchdev_attr_set_work {
>>>>>>         struct work_struct work;
>>>>>>         struct net_device *dev;
>>>>>> @@ -183,14 +226,17 @@ static void switchdev_port_attr_set_work(struct work_struct *work)
>>>>>>  {
>>>>>>         struct switchdev_attr_set_work *asw =
>>>>>>                 container_of(work, struct switchdev_attr_set_work, work);
>>>>>> +       bool rtnl_locked = rtnl_is_locked();
>>>>>>         int err;
>>>>>>
>>>>>> -       rtnl_lock();
>>>>>> -       err = switchdev_port_attr_set(asw->dev, &asw->attr);
>>>>>> +       if (!rtnl_locked)
>>>>>> +               rtnl_lock();
>>>>>
>>>>> I'm not following this change.  If someone else has rtnl_lock, we'll
>>>>> not wait to grab it here ourselves, and proceed as if we have the
>>>>> lock.  But what if that someone else releases the lock in the middle
>>>>> of us doing switchdev_port_attr_set_now?  Seems we want to
>>>>> unconditionally wait and grab the lock.  We need to block anything
>>>>>from moving while we do the attr set.
>>>>
>>>> Why would someone we call (driver) return the lock? In that case, he is
>>>> buggy and should be fixed.
>>>>
>>>> This hunk only ensures we have rtnl_lock. If not, we take it here. We do
>>>> not take it unconditionally because we may already have it, for example
>>>> if caller of switchdev_flush_deferred holds rtnl_lock.
>>>>
>>>
>>>This is where you lost me. How do you know another core doesn't happen
>>>to have the lock when you hit this code path? Maybe someone is running
>>>an ethtool command on another core or something.
>>
>> You are right. The same problem exists currently in switchdev_port_attr_set.
>
>You are right as in you'll change this back to unconditional grabbing
>of rtnl_lock?  I don't follow how this problem currently exists as
>current code does an unconditional grab of rtnl_lock.

  cpu1				cpu2
  				rtnl_lock()
switchdev_port_attr_set
  !rtnl_is_locked() == false
  switchdev_trans_init
 				rtnl_unlock()
  __switchdev_port_attr_set

now __switchdev_port_attr_set is called without rtnl_lock.

Would make sense to introduce rtnl_is_locked_by_me() or something.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Scott Feldman Oct. 13, 2015, 2:07 p.m. UTC | #10
On Tue, Oct 13, 2015 at 12:30 AM, Jiri Pirko <jiri@resnulli.us> wrote:
> Tue, Oct 13, 2015 at 08:53:45AM CEST, sfeldma@gmail.com wrote:
>>On Mon, Oct 12, 2015 at 11:21 PM, Jiri Pirko <jiri@resnulli.us> wrote:
>>> Tue, Oct 13, 2015 at 08:03:46AM CEST, john.fastabend@gmail.com wrote:
>>>>On 15-10-12 10:44 PM, Jiri Pirko wrote:
>>>>> Tue, Oct 13, 2015 at 04:52:42AM CEST, sfeldma@gmail.com wrote:
>>>>>> On Mon, Oct 12, 2015 at 11:03 AM, Jiri Pirko <jiri@resnulli.us> wrote:
>>>>>>> From: Jiri Pirko <jiri@mellanox.com>
>>>>>>>
>>>>>>> Caller should know if he can call attr_set directly (when holding RTNL)
>>>>>>> or if he has to defer the att_set processing for later.
>>>>>>>
>>>>>>> This also allows drivers to sleep inside attr_set and report operation
>>>>>>> status back to switchdev core. Switchdev core then warns if status is
>>>>>>> not ok, instead of silent errors happening in drivers.
>>>>>>>
>>>>>>> Signed-off-by: Jiri Pirko <jiri@mellanox.com>
>>>>>>> ---
>>>>>>>  include/net/switchdev.h   |   1 +
>>>>>>>  net/bridge/br_stp.c       |   3 +-
>>>>>>>  net/switchdev/switchdev.c | 107 ++++++++++++++++++++++++----------------------
>>>>>>>  3 files changed, 59 insertions(+), 52 deletions(-)
>>>>>>>
>>>>>>> diff --git a/include/net/switchdev.h b/include/net/switchdev.h
>>>>>>> index d2879f2..6b109e4 100644
>>>>>>> --- a/include/net/switchdev.h
>>>>>>> +++ b/include/net/switchdev.h
>>>>>>> @@ -17,6 +17,7 @@
>>>>>>>
>>>>>>>  #define SWITCHDEV_F_NO_RECURSE         BIT(0)
>>>>>>>  #define SWITCHDEV_F_SKIP_EOPNOTSUPP    BIT(1)
>>>>>>> +#define SWITCHDEV_F_DEFER              BIT(2)
>>>>>>>
>>>>>>>  struct switchdev_trans_item {
>>>>>>>         struct list_head list;
>>>>>>> diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c
>>>>>>> index db6d243de..80c34d7 100644
>>>>>>> --- a/net/bridge/br_stp.c
>>>>>>> +++ b/net/bridge/br_stp.c
>>>>>>> @@ -41,13 +41,14 @@ void br_set_state(struct net_bridge_port *p, unsigned int state)
>>>>>>>  {
>>>>>>>         struct switchdev_attr attr = {
>>>>>>>                 .id = SWITCHDEV_ATTR_ID_PORT_STP_STATE,
>>>>>>> +               .flags = SWITCHDEV_F_DEFER,
>>>>>>>                 .u.stp_state = state,
>>>>>>>         };
>>>>>>>         int err;
>>>>>>>
>>>>>>>         p->state = state;
>>>>>>>         err = switchdev_port_attr_set(p->dev, &attr);
>>>>>>> -       if (err && err != -EOPNOTSUPP)
>>>>>>> +       if (err)
>>>>>>
>>>>>> This looks like a problem as now all other non-switchdev ports will
>>>>>> get an WARN in the log when STP state changes.  We should only WARN if
>>>>>> there was an err and the err is not -EOPNOTSUPP.
>>>>>
>>>>> If SWITCHDEV_F_DEFER flag is set, there's only 0 of -ENOMEM.
>>>>>
>>>>>
>>>>>>
>>>>>>>                 br_warn(p->br, "error setting offload STP state on port %u(%s)\n",
>>>>>>>                                 (unsigned int) p->port_no, p->dev->name);
>>>>>>>  }
>>>>>>
>>>>>> <snip>
>>>>>>
>>>>>>>  struct switchdev_attr_set_work {
>>>>>>>         struct work_struct work;
>>>>>>>         struct net_device *dev;
>>>>>>> @@ -183,14 +226,17 @@ static void switchdev_port_attr_set_work(struct work_struct *work)
>>>>>>>  {
>>>>>>>         struct switchdev_attr_set_work *asw =
>>>>>>>                 container_of(work, struct switchdev_attr_set_work, work);
>>>>>>> +       bool rtnl_locked = rtnl_is_locked();
>>>>>>>         int err;
>>>>>>>
>>>>>>> -       rtnl_lock();
>>>>>>> -       err = switchdev_port_attr_set(asw->dev, &asw->attr);
>>>>>>> +       if (!rtnl_locked)
>>>>>>> +               rtnl_lock();
>>>>>>
>>>>>> I'm not following this change.  If someone else has rtnl_lock, we'll
>>>>>> not wait to grab it here ourselves, and proceed as if we have the
>>>>>> lock.  But what if that someone else releases the lock in the middle
>>>>>> of us doing switchdev_port_attr_set_now?  Seems we want to
>>>>>> unconditionally wait and grab the lock.  We need to block anything
>>>>>>from moving while we do the attr set.
>>>>>
>>>>> Why would someone we call (driver) return the lock? In that case, he is
>>>>> buggy and should be fixed.
>>>>>
>>>>> This hunk only ensures we have rtnl_lock. If not, we take it here. We do
>>>>> not take it unconditionally because we may already have it, for example
>>>>> if caller of switchdev_flush_deferred holds rtnl_lock.
>>>>>
>>>>
>>>>This is where you lost me. How do you know another core doesn't happen
>>>>to have the lock when you hit this code path? Maybe someone is running
>>>>an ethtool command on another core or something.
>>>
>>> You are right. The same problem exists currently in switchdev_port_attr_set.
>>
>>You are right as in you'll change this back to unconditional grabbing
>>of rtnl_lock?  I don't follow how this problem currently exists as
>>current code does an unconditional grab of rtnl_lock.
>
>   cpu1                          cpu2
>                                 rtnl_lock()
> switchdev_port_attr_set
>   !rtnl_is_locked() == false
>   switchdev_trans_init
>                                 rtnl_unlock()
>   __switchdev_port_attr_set
>
> now __switchdev_port_attr_set is called without rtnl_lock.

Got it.  Another example of trying to guess context and getting it
wrong.  This is why I like your DEFERRED option so caller can be
explicit.

> Would make sense to introduce rtnl_is_locked_by_me() or something.

Is it sufficient to simply call rtnl_lock() in your deferred context?
You can sleep there and that way there is no question who has the
lock.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jiri Pirko Oct. 13, 2015, 2:39 p.m. UTC | #11
Tue, Oct 13, 2015 at 04:07:31PM CEST, sfeldma@gmail.com wrote:
>On Tue, Oct 13, 2015 at 12:30 AM, Jiri Pirko <jiri@resnulli.us> wrote:
>> Tue, Oct 13, 2015 at 08:53:45AM CEST, sfeldma@gmail.com wrote:
>>>On Mon, Oct 12, 2015 at 11:21 PM, Jiri Pirko <jiri@resnulli.us> wrote:
>>>> Tue, Oct 13, 2015 at 08:03:46AM CEST, john.fastabend@gmail.com wrote:
>>>>>On 15-10-12 10:44 PM, Jiri Pirko wrote:
>>>>>> Tue, Oct 13, 2015 at 04:52:42AM CEST, sfeldma@gmail.com wrote:
>>>>>>> On Mon, Oct 12, 2015 at 11:03 AM, Jiri Pirko <jiri@resnulli.us> wrote:
>>>>>>>> From: Jiri Pirko <jiri@mellanox.com>
>>>>>>>>
>>>>>>>> Caller should know if he can call attr_set directly (when holding RTNL)
>>>>>>>> or if he has to defer the att_set processing for later.
>>>>>>>>
>>>>>>>> This also allows drivers to sleep inside attr_set and report operation
>>>>>>>> status back to switchdev core. Switchdev core then warns if status is
>>>>>>>> not ok, instead of silent errors happening in drivers.
>>>>>>>>
>>>>>>>> Signed-off-by: Jiri Pirko <jiri@mellanox.com>
>>>>>>>> ---
>>>>>>>>  include/net/switchdev.h   |   1 +
>>>>>>>>  net/bridge/br_stp.c       |   3 +-
>>>>>>>>  net/switchdev/switchdev.c | 107 ++++++++++++++++++++++++----------------------
>>>>>>>>  3 files changed, 59 insertions(+), 52 deletions(-)
>>>>>>>>
>>>>>>>> diff --git a/include/net/switchdev.h b/include/net/switchdev.h
>>>>>>>> index d2879f2..6b109e4 100644
>>>>>>>> --- a/include/net/switchdev.h
>>>>>>>> +++ b/include/net/switchdev.h
>>>>>>>> @@ -17,6 +17,7 @@
>>>>>>>>
>>>>>>>>  #define SWITCHDEV_F_NO_RECURSE         BIT(0)
>>>>>>>>  #define SWITCHDEV_F_SKIP_EOPNOTSUPP    BIT(1)
>>>>>>>> +#define SWITCHDEV_F_DEFER              BIT(2)
>>>>>>>>
>>>>>>>>  struct switchdev_trans_item {
>>>>>>>>         struct list_head list;
>>>>>>>> diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c
>>>>>>>> index db6d243de..80c34d7 100644
>>>>>>>> --- a/net/bridge/br_stp.c
>>>>>>>> +++ b/net/bridge/br_stp.c
>>>>>>>> @@ -41,13 +41,14 @@ void br_set_state(struct net_bridge_port *p, unsigned int state)
>>>>>>>>  {
>>>>>>>>         struct switchdev_attr attr = {
>>>>>>>>                 .id = SWITCHDEV_ATTR_ID_PORT_STP_STATE,
>>>>>>>> +               .flags = SWITCHDEV_F_DEFER,
>>>>>>>>                 .u.stp_state = state,
>>>>>>>>         };
>>>>>>>>         int err;
>>>>>>>>
>>>>>>>>         p->state = state;
>>>>>>>>         err = switchdev_port_attr_set(p->dev, &attr);
>>>>>>>> -       if (err && err != -EOPNOTSUPP)
>>>>>>>> +       if (err)
>>>>>>>
>>>>>>> This looks like a problem as now all other non-switchdev ports will
>>>>>>> get an WARN in the log when STP state changes.  We should only WARN if
>>>>>>> there was an err and the err is not -EOPNOTSUPP.
>>>>>>
>>>>>> If SWITCHDEV_F_DEFER flag is set, there's only 0 of -ENOMEM.
>>>>>>
>>>>>>
>>>>>>>
>>>>>>>>                 br_warn(p->br, "error setting offload STP state on port %u(%s)\n",
>>>>>>>>                                 (unsigned int) p->port_no, p->dev->name);
>>>>>>>>  }
>>>>>>>
>>>>>>> <snip>
>>>>>>>
>>>>>>>>  struct switchdev_attr_set_work {
>>>>>>>>         struct work_struct work;
>>>>>>>>         struct net_device *dev;
>>>>>>>> @@ -183,14 +226,17 @@ static void switchdev_port_attr_set_work(struct work_struct *work)
>>>>>>>>  {
>>>>>>>>         struct switchdev_attr_set_work *asw =
>>>>>>>>                 container_of(work, struct switchdev_attr_set_work, work);
>>>>>>>> +       bool rtnl_locked = rtnl_is_locked();
>>>>>>>>         int err;
>>>>>>>>
>>>>>>>> -       rtnl_lock();
>>>>>>>> -       err = switchdev_port_attr_set(asw->dev, &asw->attr);
>>>>>>>> +       if (!rtnl_locked)
>>>>>>>> +               rtnl_lock();
>>>>>>>
>>>>>>> I'm not following this change.  If someone else has rtnl_lock, we'll
>>>>>>> not wait to grab it here ourselves, and proceed as if we have the
>>>>>>> lock.  But what if that someone else releases the lock in the middle
>>>>>>> of us doing switchdev_port_attr_set_now?  Seems we want to
>>>>>>> unconditionally wait and grab the lock.  We need to block anything
>>>>>>>from moving while we do the attr set.
>>>>>>
>>>>>> Why would someone we call (driver) return the lock? In that case, he is
>>>>>> buggy and should be fixed.
>>>>>>
>>>>>> This hunk only ensures we have rtnl_lock. If not, we take it here. We do
>>>>>> not take it unconditionally because we may already have it, for example
>>>>>> if caller of switchdev_flush_deferred holds rtnl_lock.
>>>>>>
>>>>>
>>>>>This is where you lost me. How do you know another core doesn't happen
>>>>>to have the lock when you hit this code path? Maybe someone is running
>>>>>an ethtool command on another core or something.
>>>>
>>>> You are right. The same problem exists currently in switchdev_port_attr_set.
>>>
>>>You are right as in you'll change this back to unconditional grabbing
>>>of rtnl_lock?  I don't follow how this problem currently exists as
>>>current code does an unconditional grab of rtnl_lock.
>>
>>   cpu1                          cpu2
>>                                 rtnl_lock()
>> switchdev_port_attr_set
>>   !rtnl_is_locked() == false
>>   switchdev_trans_init
>>                                 rtnl_unlock()
>>   __switchdev_port_attr_set
>>
>> now __switchdev_port_attr_set is called without rtnl_lock.
>
>Got it.  Another example of trying to guess context and getting it
>wrong.  This is why I like your DEFERRED option so caller can be
>explicit.
>
>> Would make sense to introduce rtnl_is_locked_by_me() or something.
>
>Is it sufficient to simply call rtnl_lock() in your deferred context?
>You can sleep there and that way there is no question who has the
>lock.

The problem is that caller of flust_deferred may hold the lock already
and then we would deadlock. I'm cooking up v5 with different approach.
Stay tuned.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/include/net/switchdev.h b/include/net/switchdev.h
index d2879f2..6b109e4 100644
--- a/include/net/switchdev.h
+++ b/include/net/switchdev.h
@@ -17,6 +17,7 @@ 
 
 #define SWITCHDEV_F_NO_RECURSE		BIT(0)
 #define SWITCHDEV_F_SKIP_EOPNOTSUPP	BIT(1)
+#define SWITCHDEV_F_DEFER		BIT(2)
 
 struct switchdev_trans_item {
 	struct list_head list;
diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c
index db6d243de..80c34d7 100644
--- a/net/bridge/br_stp.c
+++ b/net/bridge/br_stp.c
@@ -41,13 +41,14 @@  void br_set_state(struct net_bridge_port *p, unsigned int state)
 {
 	struct switchdev_attr attr = {
 		.id = SWITCHDEV_ATTR_ID_PORT_STP_STATE,
+		.flags = SWITCHDEV_F_DEFER,
 		.u.stp_state = state,
 	};
 	int err;
 
 	p->state = state;
 	err = switchdev_port_attr_set(p->dev, &attr);
-	if (err && err != -EOPNOTSUPP)
+	if (err)
 		br_warn(p->br, "error setting offload STP state on port %u(%s)\n",
 				(unsigned int) p->port_no, p->dev->name);
 }
diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c
index d119e9c..9f94272 100644
--- a/net/switchdev/switchdev.c
+++ b/net/switchdev/switchdev.c
@@ -173,6 +173,49 @@  done:
 	return err;
 }
 
+static int switchdev_port_attr_set_now(struct net_device *dev,
+				       struct switchdev_attr *attr)
+{
+	struct switchdev_trans trans;
+	int err;
+
+	switchdev_trans_init(&trans);
+
+	/* Phase I: prepare for attr set. Driver/device should fail
+	 * here if there are going to be issues in the commit phase,
+	 * such as lack of resources or support.  The driver/device
+	 * should reserve resources needed for the commit phase here,
+	 * but should not commit the attr.
+	 */
+
+	trans.ph_prepare = true;
+	err = __switchdev_port_attr_set(dev, attr, &trans);
+	if (err) {
+		/* Prepare phase failed: abort the transaction.  Any
+		 * resources reserved in the prepare phase are
+		 * released.
+		 */
+
+		if (err != -EOPNOTSUPP)
+			switchdev_trans_items_destroy(&trans);
+
+		return err;
+	}
+
+	/* Phase II: commit attr set.  This cannot fail as a fault
+	 * of driver/device.  If it does, it's a bug in the driver/device
+	 * because the driver said everythings was OK in phase I.
+	 */
+
+	trans.ph_prepare = false;
+	err = __switchdev_port_attr_set(dev, attr, &trans);
+	WARN(err, "%s: Commit of attribute (id=%d) failed.\n",
+	     dev->name, attr->id);
+	switchdev_trans_items_warn_destroy(dev, &trans);
+
+	return err;
+}
+
 struct switchdev_attr_set_work {
 	struct work_struct work;
 	struct net_device *dev;
@@ -183,14 +226,17 @@  static void switchdev_port_attr_set_work(struct work_struct *work)
 {
 	struct switchdev_attr_set_work *asw =
 		container_of(work, struct switchdev_attr_set_work, work);
+	bool rtnl_locked = rtnl_is_locked();
 	int err;
 
-	rtnl_lock();
-	err = switchdev_port_attr_set(asw->dev, &asw->attr);
+	if (!rtnl_locked)
+		rtnl_lock();
+	err = switchdev_port_attr_set_now(asw->dev, &asw->attr);
 	if (err && err != -EOPNOTSUPP)
 		netdev_err(asw->dev, "failed (err=%d) to set attribute (id=%d)\n",
 			   err, asw->attr.id);
-	rtnl_unlock();
+	if (!rtnl_locked)
+		rtnl_unlock();
 
 	dev_put(asw->dev);
 	kfree(work);
@@ -211,7 +257,7 @@  static int switchdev_port_attr_set_defer(struct net_device *dev,
 	asw->dev = dev;
 	memcpy(&asw->attr, attr, sizeof(asw->attr));
 
-	schedule_work(&asw->work);
+	queue_work(switchdev_wq, &asw->work);
 
 	return 0;
 }
@@ -225,57 +271,16 @@  static int switchdev_port_attr_set_defer(struct net_device *dev,
  *	Use a 2-phase prepare-commit transaction model to ensure
  *	system is not left in a partially updated state due to
  *	failure from driver/device.
+ *
+ *	rtnl_lock must be held and must not be in atomic section,
+ *	in case SWITCHDEV_F_DEFER flag is not set.
  */
 int switchdev_port_attr_set(struct net_device *dev, struct switchdev_attr *attr)
 {
-	struct switchdev_trans trans;
-	int err;
-
-	if (!rtnl_is_locked()) {
-		/* Running prepare-commit transaction across stacked
-		 * devices requires nothing moves, so if rtnl_lock is
-		 * not held, schedule a worker thread to hold rtnl_lock
-		 * while setting attr.
-		 */
-
+	if (attr->flags & SWITCHDEV_F_DEFER)
 		return switchdev_port_attr_set_defer(dev, attr);
-	}
-
-	switchdev_trans_init(&trans);
-
-	/* Phase I: prepare for attr set. Driver/device should fail
-	 * here if there are going to be issues in the commit phase,
-	 * such as lack of resources or support.  The driver/device
-	 * should reserve resources needed for the commit phase here,
-	 * but should not commit the attr.
-	 */
-
-	trans.ph_prepare = true;
-	err = __switchdev_port_attr_set(dev, attr, &trans);
-	if (err) {
-		/* Prepare phase failed: abort the transaction.  Any
-		 * resources reserved in the prepare phase are
-		 * released.
-		 */
-
-		if (err != -EOPNOTSUPP)
-			switchdev_trans_items_destroy(&trans);
-
-		return err;
-	}
-
-	/* Phase II: commit attr set.  This cannot fail as a fault
-	 * of driver/device.  If it does, it's a bug in the driver/device
-	 * because the driver said everythings was OK in phase I.
-	 */
-
-	trans.ph_prepare = false;
-	err = __switchdev_port_attr_set(dev, attr, &trans);
-	WARN(err, "%s: Commit of attribute (id=%d) failed.\n",
-	     dev->name, attr->id);
-	switchdev_trans_items_warn_destroy(dev, &trans);
-
-	return err;
+	ASSERT_RTNL();
+	return switchdev_port_attr_set_now(dev, attr);
 }
 EXPORT_SYMBOL_GPL(switchdev_port_attr_set);