diff mbox

[net-next,v3,02/17] net: make vid as a parameter for ndo_fdb_add/ndo_fdb_del

Message ID 1416911328-10979-3-git-send-email-jiri@resnulli.us
State Changes Requested, archived
Delegated to: David Miller
Headers show

Commit Message

Jiri Pirko Nov. 25, 2014, 10:28 a.m. UTC
Do the work of parsing NDA_VLAN directly in rtnetlink code, pass simple
u16 vid to drivers from there.

Signed-off-by: Jiri Pirko <jiri@resnulli.us>
---
new in v3
---
 drivers/net/ethernet/intel/i40e/i40e_main.c      |  2 +-
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c    |  4 +-
 drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c |  9 +++--
 drivers/net/macvlan.c                            |  4 +-
 drivers/net/vxlan.c                              |  4 +-
 include/linux/netdevice.h                        |  8 ++--
 include/linux/rtnetlink.h                        |  6 ++-
 net/bridge/br_fdb.c                              | 39 ++----------------
 net/bridge/br_private.h                          |  4 +-
 net/core/rtnetlink.c                             | 50 ++++++++++++++++++++----
 10 files changed, 70 insertions(+), 60 deletions(-)

Comments

Andy Gospodarek Nov. 25, 2014, 3:13 p.m. UTC | #1
On Tue, Nov 25, 2014 at 11:28:33AM +0100, Jiri Pirko wrote:
> Do the work of parsing NDA_VLAN directly in rtnetlink code, pass simple
> u16 vid to drivers from there.
> 
> Signed-off-by: Jiri Pirko <jiri@resnulli.us>

Structurally this looks fine, just a misspelling noted below.

Acked-by: Andy Gospodarek <gospo@cumulusnetworks.com>

> ---
> new in v3
> ---
>  drivers/net/ethernet/intel/i40e/i40e_main.c      |  2 +-
>  drivers/net/ethernet/intel/ixgbe/ixgbe_main.c    |  4 +-
>  drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c |  9 +++--
>  drivers/net/macvlan.c                            |  4 +-
>  drivers/net/vxlan.c                              |  4 +-
>  include/linux/netdevice.h                        |  8 ++--
>  include/linux/rtnetlink.h                        |  6 ++-
>  net/bridge/br_fdb.c                              | 39 ++----------------
>  net/bridge/br_private.h                          |  4 +-
>  net/core/rtnetlink.c                             | 50 ++++++++++++++++++++----
>  10 files changed, 70 insertions(+), 60 deletions(-)
> 
[...]
> diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
> index a688268..f2a4b38 100644
> --- a/net/core/rtnetlink.c
> +++ b/net/core/rtnetlink.c
> @@ -36,6 +36,7 @@
>  #include <linux/mutex.h>
>  #include <linux/if_addr.h>
>  #include <linux/if_bridge.h>
> +#include <linux/if_vlan.h>
>  #include <linux/pci.h>
>  #include <linux/etherdevice.h>
>  
> @@ -2312,7 +2313,7 @@ errout:
>  int ndo_dflt_fdb_add(struct ndmsg *ndm,
>  		     struct nlattr *tb[],
>  		     struct net_device *dev,
> -		     const unsigned char *addr,
> +		     const unsigned char *addr, u16 vid,
>  		     u16 flags)
>  {
>  	int err = -EINVAL;
> @@ -2338,6 +2339,28 @@ int ndo_dflt_fdb_add(struct ndmsg *ndm,
>  }
>  EXPORT_SYMBOL(ndo_dflt_fdb_add);
>  
> +static int fbd_vid_parse(struct nlattr *vlan_attr, u16 *p_vid)
I presume this is a misspelling?

[...]
> @@ -2370,6 +2394,10 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh)
>  
>  	addr = nla_data(tb[NDA_LLADDR]);
>  
> +	err = fbd_vid_parse(tb[NDA_VLAN], &vid);
Same here....

> +	if (err)
> +		return err;
> +
>  	err = -EOPNOTSUPP;
>  
>  	/* Support fdb on master device the net/bridge default case */

[...]
> @@ -2465,6 +2496,10 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh)
>  
>  	addr = nla_data(tb[NDA_LLADDR]);
>  
> +	err = fbd_vid_parse(tb[NDA_VLAN], &vid);
...and here.

> +	if (err)
> +		return err;
> +
>  	err = -EOPNOTSUPP;
>  
>  	/* Support fdb on master device the net/bridge default case */
> @@ -2474,7 +2509,7 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh)
>  		const struct net_device_ops *ops = br_dev->netdev_ops;
>  
>  		if (ops->ndo_fdb_del)
> -			err = ops->ndo_fdb_del(ndm, tb, dev, addr);
> +			err = ops->ndo_fdb_del(ndm, tb, dev, addr, vid);
>  
>  		if (err)
>  			goto out;
> @@ -2485,9 +2520,10 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh)
>  	/* Embedded bridge, macvlan, and any other device support */
>  	if (ndm->ndm_flags & NTF_SELF) {
>  		if (dev->netdev_ops->ndo_fdb_del)
> -			err = dev->netdev_ops->ndo_fdb_del(ndm, tb, dev, addr);
> +			err = dev->netdev_ops->ndo_fdb_del(ndm, tb, dev, addr,
> +							   vid);
>  		else
> -			err = ndo_dflt_fdb_del(ndm, tb, dev, addr);
> +			err = ndo_dflt_fdb_del(ndm, tb, dev, addr, vid);
>  
>  		if (!err) {
>  			rtnl_fdb_notify(dev, addr, RTM_DELNEIGH);
> -- 
> 1.9.3
> 
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jiri Pirko Nov. 25, 2014, 3:18 p.m. UTC | #2
Tue, Nov 25, 2014 at 04:13:12PM CET, gospo@cumulusnetworks.com wrote:
>On Tue, Nov 25, 2014 at 11:28:33AM +0100, Jiri Pirko wrote:
>> Do the work of parsing NDA_VLAN directly in rtnetlink code, pass simple
>> u16 vid to drivers from there.
>> 
>> Signed-off-by: Jiri Pirko <jiri@resnulli.us>
>
>Structurally this looks fine, just a misspelling noted below.
>
>Acked-by: Andy Gospodarek <gospo@cumulusnetworks.com>
>
>> ---
>> new in v3
>> ---
>>  drivers/net/ethernet/intel/i40e/i40e_main.c      |  2 +-
>>  drivers/net/ethernet/intel/ixgbe/ixgbe_main.c    |  4 +-
>>  drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c |  9 +++--
>>  drivers/net/macvlan.c                            |  4 +-
>>  drivers/net/vxlan.c                              |  4 +-
>>  include/linux/netdevice.h                        |  8 ++--
>>  include/linux/rtnetlink.h                        |  6 ++-
>>  net/bridge/br_fdb.c                              | 39 ++----------------
>>  net/bridge/br_private.h                          |  4 +-
>>  net/core/rtnetlink.c                             | 50 ++++++++++++++++++++----
>>  10 files changed, 70 insertions(+), 60 deletions(-)
>> 
>[...]
>> diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
>> index a688268..f2a4b38 100644
>> --- a/net/core/rtnetlink.c
>> +++ b/net/core/rtnetlink.c
>> @@ -36,6 +36,7 @@
>>  #include <linux/mutex.h>
>>  #include <linux/if_addr.h>
>>  #include <linux/if_bridge.h>
>> +#include <linux/if_vlan.h>
>>  #include <linux/pci.h>
>>  #include <linux/etherdevice.h>
>>  
>> @@ -2312,7 +2313,7 @@ errout:
>>  int ndo_dflt_fdb_add(struct ndmsg *ndm,
>>  		     struct nlattr *tb[],
>>  		     struct net_device *dev,
>> -		     const unsigned char *addr,
>> +		     const unsigned char *addr, u16 vid,
>>  		     u16 flags)
>>  {
>>  	int err = -EINVAL;
>> @@ -2338,6 +2339,28 @@ int ndo_dflt_fdb_add(struct ndmsg *ndm,
>>  }
>>  EXPORT_SYMBOL(ndo_dflt_fdb_add);
>>  
>> +static int fbd_vid_parse(struct nlattr *vlan_attr, u16 *p_vid)
>I presume this is a misspelling?

Darn. Thanks. Will fix this.

>
>[...]
>> @@ -2370,6 +2394,10 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh)
>>  
>>  	addr = nla_data(tb[NDA_LLADDR]);
>>  
>> +	err = fbd_vid_parse(tb[NDA_VLAN], &vid);
>Same here....
>
>> +	if (err)
>> +		return err;
>> +
>>  	err = -EOPNOTSUPP;
>>  
>>  	/* Support fdb on master device the net/bridge default case */
>
>[...]
>> @@ -2465,6 +2496,10 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh)
>>  
>>  	addr = nla_data(tb[NDA_LLADDR]);
>>  
>> +	err = fbd_vid_parse(tb[NDA_VLAN], &vid);
>...and here.
>
>> +	if (err)
>> +		return err;
>> +
>>  	err = -EOPNOTSUPP;
>>  
>>  	/* Support fdb on master device the net/bridge default case */
>> @@ -2474,7 +2509,7 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh)
>>  		const struct net_device_ops *ops = br_dev->netdev_ops;
>>  
>>  		if (ops->ndo_fdb_del)
>> -			err = ops->ndo_fdb_del(ndm, tb, dev, addr);
>> +			err = ops->ndo_fdb_del(ndm, tb, dev, addr, vid);
>>  
>>  		if (err)
>>  			goto out;
>> @@ -2485,9 +2520,10 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh)
>>  	/* Embedded bridge, macvlan, and any other device support */
>>  	if (ndm->ndm_flags & NTF_SELF) {
>>  		if (dev->netdev_ops->ndo_fdb_del)
>> -			err = dev->netdev_ops->ndo_fdb_del(ndm, tb, dev, addr);
>> +			err = dev->netdev_ops->ndo_fdb_del(ndm, tb, dev, addr,
>> +							   vid);
>>  		else
>> -			err = ndo_dflt_fdb_del(ndm, tb, dev, addr);
>> +			err = ndo_dflt_fdb_del(ndm, tb, dev, addr, vid);
>>  
>>  		if (!err) {
>>  			rtnl_fdb_notify(dev, addr, RTM_DELNEIGH);
>> -- 
>> 1.9.3
>> 
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jamal Hadi Salim Nov. 25, 2014, 3:38 p.m. UTC | #3
On 11/25/14 05:28, Jiri Pirko wrote:
> Do the work of parsing NDA_VLAN directly in rtnetlink code, pass simple
> u16 vid to drivers from there.
>

Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>

I know this maintains status quo of what is already in the kernel.
But we need to take care of policy (pass it from user space) which
dictates how to proceed on failure. Three possible options:
1) If something fails just continue with the rest of the transaction.
Return success if at least one thing succeeds.
2) If something fails stop transaction and return some partial success code
3) If something fails undo everything that has been done and return failure.

So two bits from somewhere would be useful to send from userspace->kernel


> +static int fbd_vid_parse(struct nlattr *vlan_attr, u16 *p_vid)

typo fbd_vid_parse -> fdb_vid_parse

cheers,
jamal


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
John Fastabend Nov. 25, 2014, 3:43 p.m. UTC | #4
On 11/25/2014 07:18 AM, Jiri Pirko wrote:
> Tue, Nov 25, 2014 at 04:13:12PM CET, gospo@cumulusnetworks.com wrote:
>> On Tue, Nov 25, 2014 at 11:28:33AM +0100, Jiri Pirko wrote:
>>> Do the work of parsing NDA_VLAN directly in rtnetlink code, pass simple
>>> u16 vid to drivers from there.
>>>
>>> Signed-off-by: Jiri Pirko <jiri@resnulli.us>
>>
>> Structurally this looks fine, just a misspelling noted below.
>>
>> Acked-by: Andy Gospodarek <gospo@cumulusnetworks.com>
>>

If your going to spin this, should we return an error from
ndo_dflt_fdb_add() when we have a non-zero vid? The dflt
handler uses the dev_(mc|uc)_add_excl routines which will
not consume vids.

If you want to address this with a follow up patch I'm OK
with that. Go ahead and add my ack,

Acked-by: John Fastabend <john.r.fastabend@intel.com>
John Fastabend Nov. 25, 2014, 4:01 p.m. UTC | #5
On 11/25/2014 07:38 AM, Jamal Hadi Salim wrote:
> On 11/25/14 05:28, Jiri Pirko wrote:
>> Do the work of parsing NDA_VLAN directly in rtnetlink code, pass simple
>> u16 vid to drivers from there.
>>
> 
> Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
> 
> I know this maintains status quo of what is already in the kernel.
> But we need to take care of policy (pass it from user space) which
> dictates how to proceed on failure. Three possible options:
> 1) If something fails just continue with the rest of the transaction.
> Return success if at least one thing succeeds.

I'm not sure how (1) works. We can't just let user-space/management
software run along thinking its configuration is set when its
not. At least it doesn't look very appealing for the software I'm
looking at.

> 2) If something fails stop transaction and return some partial success code

Option (2) is the current behavior of fdb this is straight forward
and punts the complexity to user space. And at least the state is
always known.

> 3) If something fails undo everything that has been done and return failure.
> 

Sure this would be nice to have when doing bulk updates and is more
useful on hardware that has a commit phase where updates don't actually
occur until they are committed.

> So two bits from somewhere would be useful to send from userspace->kernel
> 

+1 for a follow up patch though.

> 
>> +static int fbd_vid_parse(struct nlattr *vlan_attr, u16 *p_vid)
> 
> typo fbd_vid_parse -> fdb_vid_parse
> 
> cheers,
> jamal
> 
> 

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jamal Hadi Salim Nov. 25, 2014, 4:18 p.m. UTC | #6
On 11/25/14 11:01, John Fastabend wrote:
> On 11/25/2014 07:38 AM, Jamal Hadi Salim wrote:
>> On 11/25/14 05:28, Jiri Pirko wrote:
>>> Do the work of parsing NDA_VLAN directly in rtnetlink code, pass simple
>>> u16 vid to drivers from there.
>>>
>>
>> Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
>>
>> I know this maintains status quo of what is already in the kernel.
>> But we need to take care of policy (pass it from user space) which
>> dictates how to proceed on failure. Three possible options:
>> 1) If something fails just continue with the rest of the transaction.
>> Return success if at least one thing succeeds.
>
> I'm not sure how (1) works. We can't just let user-space/management
> software run along thinking its configuration is set when its
> not. At least it doesn't look very appealing for the software I'm
> looking at.
>

Thats why it is a policy - just dont use it ;->
IOW, if the user made that choice the consequences are clear i.e there
is no confusion.
Example:
I could add 100 entries and if the 10th one failed for some reason to
apply to software version, I want to continue adding as many as i can
possibly add in the hardware etc.

>> 2) If something fails stop transaction and return some partial success code
>
> Option (2) is the current behavior of fdb this is straight forward
> and punts the complexity to user space. And at least the state is
> always known.
>

I dont think we return "partial" success code, do we?
I think we stop when  software fails and dont care if hardware fails.
So this is status quo - we can do better..

>> 3) If something fails undo everything that has been done and return failure.
>>
>
> Sure this would be nice to have when doing bulk updates and is more
> useful on hardware that has a commit phase where updates don't actually
> occur until they are committed.
>

Indeed - i dont expect this option to be used as much but identifying
as a need now is important.

>> So two bits from somewhere would be useful to send from userspace->kernel
>>
>
> +1 for a follow up patch though.
>

As long as we are not adding any new behavior - agreed.
I dont see us doing that, so no controversy (hence my ACK).

cheers,
jamal

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Roopa Prabhu Nov. 25, 2014, 4:19 p.m. UTC | #7
On 11/25/14, 7:38 AM, Jamal Hadi Salim wrote:
> On 11/25/14 05:28, Jiri Pirko wrote:
>> Do the work of parsing NDA_VLAN directly in rtnetlink code, pass simple
>> u16 vid to drivers from there.
>>
>
> Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
>
> I know this maintains status quo of what is already in the kernel.
> But we need to take care of policy (pass it from user space) which
> dictates how to proceed on failure. Three possible options:
> 1) If something fails just continue with the rest of the transaction.
> Return success if at least one thing succeeds.
> 2) If something fails stop transaction and return some partial success 
> code
> 3) If something fails undo everything that has been done and return 
> failure.
>
> So two bits from somewhere would be useful to send from userspace->kernel
>
>

ack to what jamal said.  In the model where sw and hw must be in sync, 
we need a mechanism to roll back in this approach.

I like that you are using existing ops.
To avoid the synchronization problem or to make the rollback easier, You 
can still use existing ops and move this into the bridge driver.
ie call ndo_fdb_add/del and ndo_bridge_setlink/ndo_bridge_getlink on the 
bridge port from within the bridge driver.

Again, vote for change ndo_bridge_setlink/ndo_bridge_getlink to be 
renamed to ndo_setlink/getlink for other netdevs. I can submit a patch.


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
John Fastabend Nov. 25, 2014, 4:30 p.m. UTC | #8
On 11/25/2014 08:18 AM, Jamal Hadi Salim wrote:
> On 11/25/14 11:01, John Fastabend wrote:
>> On 11/25/2014 07:38 AM, Jamal Hadi Salim wrote:
>>> On 11/25/14 05:28, Jiri Pirko wrote:
>>>> Do the work of parsing NDA_VLAN directly in rtnetlink code, pass simple
>>>> u16 vid to drivers from there.
>>>>
>>>
>>> Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
>>>
>>> I know this maintains status quo of what is already in the kernel.
>>> But we need to take care of policy (pass it from user space) which
>>> dictates how to proceed on failure. Three possible options:
>>> 1) If something fails just continue with the rest of the transaction.
>>> Return success if at least one thing succeeds.
>>
>> I'm not sure how (1) works. We can't just let user-space/management
>> software run along thinking its configuration is set when its
>> not. At least it doesn't look very appealing for the software I'm
>> looking at.
>>
> 
> Thats why it is a policy - just dont use it ;->
> IOW, if the user made that choice the consequences are clear i.e there
> is no confusion.
> Example:
> I could add 100 entries and if the 10th one failed for some reason to
> apply to software version, I want to continue adding as many as i can
> possibly add in the hardware etc.

Actually (after having some coffee) this becomes much more useful
if you return which items failed. Then you can slam the hardware
with your 100 entries, probably a lot more then that, and come back
later and clean it up.

> 
>>> 2) If something fails stop transaction and return some partial success code
>>
>> Option (2) is the current behavior of fdb this is straight forward
>> and punts the complexity to user space. And at least the state is
>> always known.
>>
> 
> I dont think we return "partial" success code, do we?
> I think we stop when  software fails and dont care if hardware fails.
> So this is status quo - we can do better..
> 

We return a bitmask of which operations were successful. So if SW fails
we have both bits cleared and we abort. When SW is successful we set the
SW bit and try to program the HW. If its sucessful we set the HW bit if
its not we abort with an err. Converting this to (1) is not much work
just skip the abort.

>>> 3) If something fails undo everything that has been done and return failure.
>>>
>>
>> Sure this would be nice to have when doing bulk updates and is more
>> useful on hardware that has a commit phase where updates don't actually
>> occur until they are committed.
>>
> 
> Indeed - i dont expect this option to be used as much but identifying
> as a need now is important.
> 
>>> So two bits from somewhere would be useful to send from userspace->kernel
>>>
>>
>> +1 for a follow up patch though.
>>
> 
> As long as we are not adding any new behavior - agreed.
> I dont see us doing that, so no controversy (hence my ACK).
> 
> cheers,
> jamal
> 

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
John Fastabend Nov. 25, 2014, 4:33 p.m. UTC | #9
On 11/25/2014 08:19 AM, Roopa Prabhu wrote:
> On 11/25/14, 7:38 AM, Jamal Hadi Salim wrote:
>> On 11/25/14 05:28, Jiri Pirko wrote:
>>> Do the work of parsing NDA_VLAN directly in rtnetlink code, pass simple
>>> u16 vid to drivers from there.
>>>
>>
>> Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
>>
>> I know this maintains status quo of what is already in the kernel.
>> But we need to take care of policy (pass it from user space) which
>> dictates how to proceed on failure. Three possible options:
>> 1) If something fails just continue with the rest of the transaction.
>> Return success if at least one thing succeeds.
>> 2) If something fails stop transaction and return some partial success code
>> 3) If something fails undo everything that has been done and return failure.
>>
>> So two bits from somewhere would be useful to send from userspace->kernel
>>
>>
> 
> ack to what jamal said.  In the model where sw and hw must be in sync, we need a mechanism to roll back in this approach.

I agree its needed but your already out of sync for some period of time
why the software/hardware tables are being programmed. There is no global
sw/hw commit operation.

I'm not sure it matters if the time being out of sync is a touch longer
because we go to user space to fix it. But agreed it can be supported.

> 
> I like that you are using existing ops.
> To avoid the synchronization problem or to make the rollback easier, You can still use existing ops and move this into the bridge driver.
> ie call ndo_fdb_add/del and ndo_bridge_setlink/ndo_bridge_getlink on the bridge port from within the bridge driver.
> 
> Again, vote for change ndo_bridge_setlink/ndo_bridge_getlink to be renamed to ndo_setlink/getlink for other netdevs. I can submit a patch.
> 
> 

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jiri Pirko Nov. 25, 2014, 4:38 p.m. UTC | #10
Tue, Nov 25, 2014 at 04:43:13PM CET, john.fastabend@gmail.com wrote:
>On 11/25/2014 07:18 AM, Jiri Pirko wrote:
>>Tue, Nov 25, 2014 at 04:13:12PM CET, gospo@cumulusnetworks.com wrote:
>>>On Tue, Nov 25, 2014 at 11:28:33AM +0100, Jiri Pirko wrote:
>>>>Do the work of parsing NDA_VLAN directly in rtnetlink code, pass simple
>>>>u16 vid to drivers from there.
>>>>
>>>>Signed-off-by: Jiri Pirko <jiri@resnulli.us>
>>>
>>>Structurally this looks fine, just a misspelling noted below.
>>>
>>>Acked-by: Andy Gospodarek <gospo@cumulusnetworks.com>
>>>
>
>If your going to spin this, should we return an error from
>ndo_dflt_fdb_add() when we have a non-zero vid? The dflt
>handler uses the dev_(mc|uc)_add_excl routines which will
>not consume vids.

Hmm. That would break existing scripts blindly setting fdb with vlan.
Not that is makes sense, just that we might not want to break these.

>
>If you want to address this with a follow up patch I'm OK
>with that. Go ahead and add my ack,
>
>Acked-by: John Fastabend <john.r.fastabend@intel.com>
>
>
>-- 
>John Fastabend         Intel Corporation
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jiri Pirko Nov. 25, 2014, 4:43 p.m. UTC | #11
Tue, Nov 25, 2014 at 05:19:36PM CET, roopa@cumulusnetworks.com wrote:
>On 11/25/14, 7:38 AM, Jamal Hadi Salim wrote:
>>On 11/25/14 05:28, Jiri Pirko wrote:
>>>Do the work of parsing NDA_VLAN directly in rtnetlink code, pass simple
>>>u16 vid to drivers from there.
>>>
>>
>>Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
>>
>>I know this maintains status quo of what is already in the kernel.
>>But we need to take care of policy (pass it from user space) which
>>dictates how to proceed on failure. Three possible options:
>>1) If something fails just continue with the rest of the transaction.
>>Return success if at least one thing succeeds.
>>2) If something fails stop transaction and return some partial success code
>>3) If something fails undo everything that has been done and return
>>failure.
>>
>>So two bits from somewhere would be useful to send from userspace->kernel
>>
>>
>
>ack to what jamal said.  In the model where sw and hw must be in sync, we
>need a mechanism to roll back in this approach.
>
>I like that you are using existing ops.
>To avoid the synchronization problem or to make the rollback easier, You can
>still use existing ops and move this into the bridge driver.
>ie call ndo_fdb_add/del and ndo_bridge_setlink/ndo_bridge_getlink on the
>bridge port from within the bridge driver.
>
>Again, vote for change ndo_bridge_setlink/ndo_bridge_getlink to be renamed to
>ndo_setlink/getlink for other netdevs. I can submit a patch.

That is not right I believe. This is for PF_BRIDGE, should have "bridge"
in it because just "setlink/getlink" might be mistaken with similar rtnl
ops.

>
>
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jamal Hadi Salim Nov. 25, 2014, 4:50 p.m. UTC | #12
On 11/25/14 11:30, John Fastabend wrote:
> On 11/25/2014 08:18 AM, Jamal Hadi Salim wrote:
>> On 11/25/14 11:01, John Fastabend wrote:
>>> On 11/25/2014 07:38 AM, Jamal Hadi Salim wrote:
>>>> On 11/25/14 05:28, Jiri Pirko wrote:
>>>>> Do the work of parsing NDA_VLAN directly in rtnetlink code, pass simple
>>>>> u16 vid to drivers from there.
>>>>>
>>>>


> Actually (after having some coffee) this becomes much more useful
> if you return which items failed. Then you can slam the hardware
> with your 100 entries, probably a lot more then that, and come back
> later and clean it up.
>

Yes, that is the general use case.
Unfortunately at the moment we only return codes on a netlink set
direction - but would be a beauty if we could return what succeeded
and didnt in some form of vector.
Note: all is not lost because you can always do a get afterwards and
find what is missing if you got a return code of "partial success".
Just a little less efficient..


> We return a bitmask of which operations were successful. So if SW fails
> we have both bits cleared and we abort. When SW is successful we set the
> SW bit and try to program the HW. If its sucessful we set the HW bit if
> its not we abort with an err. Converting this to (1) is not much work
> just skip the abort.
>

Ok, guess i am gonna have to go stare at the code some more.
I thought we returned one of the error codes?
A bitmask would work for a single entry - because you have two
options add to h/ware and/or s/ware. So response is easy to encode.
But if i have 1000 and they are sparsely populated (think an indexed
table and i have indices 1, 23, 45, etc), then a bitmask would be
hard to use.

cheers,
jamal
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jamal Hadi Salim Nov. 25, 2014, 4:57 p.m. UTC | #13
On 11/25/14 11:33, John Fastabend wrote:
> On 11/25/2014 08:19 AM, Roopa Prabhu wrote:

> I agree its needed but your already out of sync for some period of time
> why the software/hardware tables are being programmed. There is no global
> sw/hw commit operation.
>
> I'm not sure it matters if the time being out of sync is a touch longer
> because we go to user space to fix it. But agreed it can be supported.
>


Recent netfilter has 2 phase commit built in. Maybe we can generalize
that?
Note, there are use cases where it is important to do rollbacks.
If i am doing a distributed router, then to make sure a FIB/NH entries
are properly synced in across the cluster is extremely important. You
cant justify letting a few packets sneak in the wrong path.
in other words, strong consistency is important.
Having said that things can be worked around (and i would not use
2pc for the example use case i gave); however, that doesnt negate
the fact we need it.

cheers,
jamal
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Samudrala, Sridhar Nov. 25, 2014, 6:53 p.m. UTC | #14
On 11/25/2014 2:28 AM, Jiri Pirko wrote:
> Do the work of parsing NDA_VLAN directly in rtnetlink code, pass simple
> u16 vid to drivers from there.
>
> Signed-off-by: Jiri Pirko <jiri@resnulli.us>
> ---
> new in v3
> ---
>   drivers/net/ethernet/intel/i40e/i40e_main.c      |  2 +-
>   drivers/net/ethernet/intel/ixgbe/ixgbe_main.c    |  4 +-
>   drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c |  9 +++--
>   drivers/net/macvlan.c                            |  4 +-
>   drivers/net/vxlan.c                              |  4 +-
>   include/linux/netdevice.h                        |  8 ++--
>   include/linux/rtnetlink.h                        |  6 ++-
>   net/bridge/br_fdb.c                              | 39 ++----------------
>   net/bridge/br_private.h                          |  4 +-
>   net/core/rtnetlink.c                             | 50 ++++++++++++++++++++----
>   10 files changed, 70 insertions(+), 60 deletions(-)
>
<deleted>
>   
> +static int fbd_vid_parse(struct nlattr *vlan_attr, u16 *p_vid)

looks like a typo? fdb_vid_parse()


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jiri Pirko Nov. 25, 2014, 8:40 p.m. UTC | #15
Tue, Nov 25, 2014 at 07:53:17PM CET, sridhar.samudrala@intel.com wrote:
>
>On 11/25/2014 2:28 AM, Jiri Pirko wrote:
>>Do the work of parsing NDA_VLAN directly in rtnetlink code, pass simple
>>u16 vid to drivers from there.
>>
>>Signed-off-by: Jiri Pirko <jiri@resnulli.us>
>>---
>>new in v3
>>---
>>  drivers/net/ethernet/intel/i40e/i40e_main.c      |  2 +-
>>  drivers/net/ethernet/intel/ixgbe/ixgbe_main.c    |  4 +-
>>  drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c |  9 +++--
>>  drivers/net/macvlan.c                            |  4 +-
>>  drivers/net/vxlan.c                              |  4 +-
>>  include/linux/netdevice.h                        |  8 ++--
>>  include/linux/rtnetlink.h                        |  6 ++-
>>  net/bridge/br_fdb.c                              | 39 ++----------------
>>  net/bridge/br_private.h                          |  4 +-
>>  net/core/rtnetlink.c                             | 50 ++++++++++++++++++++----
>>  10 files changed, 70 insertions(+), 60 deletions(-)
>>
><deleted>
>>+static int fbd_vid_parse(struct nlattr *vlan_attr, u16 *p_vid)
>
>looks like a typo? fdb_vid_parse()

Already fixed. You are actually the third person pointing at this :)

>
>
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Thomas Graf Nov. 25, 2014, 10:14 p.m. UTC | #16
On 11/25/14 at 11:28am, Jiri Pirko wrote:
> Do the work of parsing NDA_VLAN directly in rtnetlink code, pass simple
> u16 vid to drivers from there.
> 
> Signed-off-by: Jiri Pirko <jiri@resnulli.us>

I'm slightly confused ;-)

We both argued that parsing Netlink attributes in the drivers is wrong.
What happened to the plan of renaming ndo_fdb_ to ndo_neigh_ and
introducing a non-Netlink in-kernel API for advanced usage by swdev?
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Florian Fainelli Nov. 25, 2014, 10:39 p.m. UTC | #17
On 25/11/14 14:14, Thomas Graf wrote:
> On 11/25/14 at 11:28am, Jiri Pirko wrote:
>> Do the work of parsing NDA_VLAN directly in rtnetlink code, pass simple
>> u16 vid to drivers from there.
>>
>> Signed-off-by: Jiri Pirko <jiri@resnulli.us>
> 
> I'm slightly confused ;-)
> 
> We both argued that parsing Netlink attributes in the drivers is wrong.
> What happened to the plan of renaming ndo_fdb_ to ndo_neigh_ and
> introducing a non-Netlink in-kernel API for advanced usage by swdev?
> 

Not sure I follow you here, the commit message says what it says it
does, are we looking at the same patch?
--
Florian
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Thomas Graf Nov. 25, 2014, 11:11 p.m. UTC | #18
On 11/25/14 at 02:39pm, Florian Fainelli wrote:
> On 25/11/14 14:14, Thomas Graf wrote:
> > On 11/25/14 at 11:28am, Jiri Pirko wrote:
> >> Do the work of parsing NDA_VLAN directly in rtnetlink code, pass simple
> >> u16 vid to drivers from there.
> >>
> >> Signed-off-by: Jiri Pirko <jiri@resnulli.us>
> > 
> > I'm slightly confused ;-)
> > 
> > We both argued that parsing Netlink attributes in the drivers is wrong.
> > What happened to the plan of renaming ndo_fdb_ to ndo_neigh_ and
> > introducing a non-Netlink in-kernel API for advanced usage by swdev?
> > 
> 
> Not sure I follow you here, the commit message says what it says it
> does, are we looking at the same patch?

I'm referring to the discussion that occured on patch 06/10 of v2:

http://www.spinics.net/lists/netdev/msg303637.html
http://www.spinics.net/lists/netdev/msg303669.html
http://www.spinics.net/lists/netdev/msg303689.html
http://www.spinics.net/lists/netdev/msg303706.html

I won't hold up this series though. Glad to do the work afterwards.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Simon Horman Nov. 26, 2014, 1:44 a.m. UTC | #19
On Tue, Nov 25, 2014 at 11:50:27AM -0500, Jamal Hadi Salim wrote:
> On 11/25/14 11:30, John Fastabend wrote:
> >On 11/25/2014 08:18 AM, Jamal Hadi Salim wrote:
> >>On 11/25/14 11:01, John Fastabend wrote:
> >>>On 11/25/2014 07:38 AM, Jamal Hadi Salim wrote:
> >>>>On 11/25/14 05:28, Jiri Pirko wrote:
> >>>>>Do the work of parsing NDA_VLAN directly in rtnetlink code, pass simple
> >>>>>u16 vid to drivers from there.
> >>>>>
> >>>>
> 
> 
> >Actually (after having some coffee) this becomes much more useful
> >if you return which items failed. Then you can slam the hardware
> >with your 100 entries, probably a lot more then that, and come back
> >later and clean it up.
> >
> 
> Yes, that is the general use case.
> Unfortunately at the moment we only return codes on a netlink set
> direction - but would be a beauty if we could return what succeeded
> and didnt in some form of vector.
> Note: all is not lost because you can always do a get afterwards and
> find what is missing if you got a return code of "partial success".
> Just a little less efficient..

I agree entirely. But efficiency may be a very real issue in practice.

> >We return a bitmask of which operations were successful. So if SW fails
> >we have both bits cleared and we abort. When SW is successful we set the
> >SW bit and try to program the HW. If its sucessful we set the HW bit if
> >its not we abort with an err. Converting this to (1) is not much work
> >just skip the abort.
> >
> 
> Ok, guess i am gonna have to go stare at the code some more.
> I thought we returned one of the error codes?
> A bitmask would work for a single entry - because you have two
> options add to h/ware and/or s/ware. So response is easy to encode.
> But if i have 1000 and they are sparsely populated (think an indexed
> table and i have indices 1, 23, 45, etc), then a bitmask would be
> hard to use.
> 
> cheers,
> jamal
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Scott Feldman Nov. 26, 2014, 2:36 a.m. UTC | #20
On Tue, Nov 25, 2014 at 6:50 AM, Jamal Hadi Salim <jhs@mojatatu.com> wrote:
> On 11/25/14 11:30, John Fastabend wrote:
>>
>> On 11/25/2014 08:18 AM, Jamal Hadi Salim wrote:
>>>
>>> On 11/25/14 11:01, John Fastabend wrote:
>>>>
>>>> On 11/25/2014 07:38 AM, Jamal Hadi Salim wrote:
>>>>>
>>>>> On 11/25/14 05:28, Jiri Pirko wrote:
>>>>>>
>>>>>> Do the work of parsing NDA_VLAN directly in rtnetlink code, pass
>>>>>> simple
>>>>>> u16 vid to drivers from there.
>>>>>>
>>>>>
>
>
>> Actually (after having some coffee) this becomes much more useful
>> if you return which items failed. Then you can slam the hardware
>> with your 100 entries, probably a lot more then that, and come back
>> later and clean it up.
>>
>
> Yes, that is the general use case.
> Unfortunately at the moment we only return codes on a netlink set
> direction - but would be a beauty if we could return what succeeded
> and didnt in some form of vector.
> Note: all is not lost because you can always do a get afterwards and
> find what is missing if you got a return code of "partial success".
> Just a little less efficient..
>
>
>> We return a bitmask of which operations were successful. So if SW fails
>> we have both bits cleared and we abort. When SW is successful we set the
>> SW bit and try to program the HW. If its sucessful we set the HW bit if
>> its not we abort with an err. Converting this to (1) is not much work
>> just skip the abort.
>>
>
> Ok, guess i am gonna have to go stare at the code some more.
> I thought we returned one of the error codes?
> A bitmask would work for a single entry - because you have two
> options add to h/ware and/or s/ware. So response is easy to encode.
> But if i have 1000 and they are sparsely populated (think an indexed
> table and i have indices 1, 23, 45, etc), then a bitmask would be
> hard to use.

I'm confused by this discussion.  Do I have this right: You want to
send 1000 RTM_NEWNEIGHs to PF_BRIDGE with both NTF_MASTER and NTF_SELF
set such that 1000 new FBD entries are installed in both (SW) the
bridge's FDB and (HW) the port driver's FDB.  My first confusion is
why do you want these FBD entries in bridge's FDB?  We're offloading
the switching to HW so HW should be handling fwd plane.  If ctrl pkt
make it to SW, it can learn those FDB entries; no need for manual
install of FDB entry in SW.  It seems to me you only want to use
NTF_SELF to install the FDB entry in HW using the port driver.  And an
error code is returned for that install.  Since there is only one
target (NTF_SELF) there is no need for bitmask return.

> cheers,
> jamal
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jamal Hadi Salim Nov. 26, 2014, 3:19 a.m. UTC | #21
On 11/25/14 21:36, Scott Feldman wrote:
> On Tue, Nov 25, 2014 at 6:50 AM, Jamal Hadi Salim <jhs@mojatatu.com> wrote:
>> On 11/25/14 11:30, John Fastabend wrote:

>> Ok, guess i am gonna have to go stare at the code some more.
>> I thought we returned one of the error codes?
>> A bitmask would work for a single entry - because you have two
>> options add to h/ware and/or s/ware. So response is easy to encode.
>> But if i have 1000 and they are sparsely populated (think an indexed
>> table and i have indices 1, 23, 45, etc), then a bitmask would be
>> hard to use.
>
> I'm confused by this discussion.

This is about the policy which states "install as many as you can, dont
worry about failures". In such a case, how do you tell user space back
"oh, btw you know your request #1, #23, and 45 went ok, but nothing else
worked". A simple return code wont work. You could return a code to
say "some worked". At which case user space could dump and find out only
#1, #23 and #45 worked.

Your question below is a different context; Some people may want
the policy where whats in hardware
a) gets to be seen in software and b) allow for destination lookup
failures in hardware to show up in the kernel, refresh the fdb in the
kernel via learning
and c) whats in s.ware gets synced to hardware just because there's
space in the hardware
I dont want any of the above;-> Which would work if we had policy knobs.
Learning, flooding, sync from hardware. Speaking of the last one:
Where is my cookie Scott? I want my cookie.

cheers,
jamal


> Do I have this right: You want to
> send 1000 RTM_NEWNEIGHs to PF_BRIDGE with both NTF_MASTER and NTF_SELF
> set such that 1000 new FBD entries are installed in both (SW) the
> bridge's FDB and (HW) the port driver's FDB.  My first confusion is
> why do you want these FBD entries in bridge's FDB?  We're offloading
> the switching to HW so HW should be handling fwd plane.  If ctrl pkt
> make it to SW, it can learn those FDB entries;
> no need for manual
> install of FDB entry in SW.  It seems to me you only want to use
> NTF_SELF to install the FDB entry in HW using the port driver.  And an
> error code is returned for that install.  Since there is only one
> target (NTF_SELF) there is no need for bitmask return.



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Scott Feldman Nov. 26, 2014, 3:59 a.m. UTC | #22
On Tue, Nov 25, 2014 at 5:19 PM, Jamal Hadi Salim <jhs@mojatatu.com> wrote:
> On 11/25/14 21:36, Scott Feldman wrote:
>>
>> On Tue, Nov 25, 2014 at 6:50 AM, Jamal Hadi Salim <jhs@mojatatu.com>
>> wrote:
>>>
>>> On 11/25/14 11:30, John Fastabend wrote:
>
>
>>> Ok, guess i am gonna have to go stare at the code some more.
>>> I thought we returned one of the error codes?
>>> A bitmask would work for a single entry - because you have two
>>> options add to h/ware and/or s/ware. So response is easy to encode.
>>> But if i have 1000 and they are sparsely populated (think an indexed
>>> table and i have indices 1, 23, 45, etc), then a bitmask would be
>>> hard to use.
>>
>>
>> I'm confused by this discussion.
>
>
> This is about the policy which states "install as many as you can, dont
> worry about failures". In such a case, how do you tell user space back
> "oh, btw you know your request #1, #23, and 45 went ok, but nothing else
> worked". A simple return code wont work. You could return a code to
> say "some worked". At which case user space could dump and find out only
> #1, #23 and #45 worked.

You request for what?  That's my confusion.  Are you trying to install
FDB entry into both SW and HW at same time?  And then do a bunch in a
batch?  I'm saying use MASTER for SW and SELF for HW in two steps, if
you want FDB entry installed in both Sw and HW.  Check your return
code each step.  Batch all to HW first, then batch all that PASSED to
SW.  I don't even know really why you're trying to install to both HW
and SW.  Install it to HW and be done. fdb_dump will set HW entries
via SELF.

> Your question below is a different context; Some people may want
> the policy where whats in hardware
> a) gets to be seen in software and b) allow for destination lookup
> failures in hardware to show up in the kernel, refresh the fdb in the
> kernel via learning
> and c) whats in s.ware gets synced to hardware just because there's
> space in the hardware
> I dont want any of the above;-> Which would work if we had policy knobs.
> Learning, flooding, sync from hardware. Speaking of the last one:
> Where is my cookie Scott? I want my cookie.

Ah, Jamal, look again at patches 13-17/17 in last v3 set.  That was a
big steaming snickerdoodle just for you!  Now you can push policy
knobs down to port driver and or bridge to fine tune what ever you
want.  You'll find knobs for learning, flooding, learning sync to hw,
etc.  I thought you even ACKed some of these.  a) above knob is 14/17
patch, b) above is using existing learning knob on bridge, c) above I
don't get...no point in syncing that direction.

> cheers,
> jamal
>
>
>
>> Do I have this right: You want to
>> send 1000 RTM_NEWNEIGHs to PF_BRIDGE with both NTF_MASTER and NTF_SELF
>> set such that 1000 new FBD entries are installed in both (SW) the
>> bridge's FDB and (HW) the port driver's FDB.  My first confusion is
>> why do you want these FBD entries in bridge's FDB?  We're offloading
>> the switching to HW so HW should be handling fwd plane.  If ctrl pkt
>> make it to SW, it can learn those FDB entries;
>> no need for manual
>> install of FDB entry in SW.  It seems to me you only want to use
>> NTF_SELF to install the FDB entry in HW using the port driver.  And an
>> error code is returned for that install.  Since there is only one
>> target (NTF_SELF) there is no need for bitmask return.
>
>
>
>
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jiri Pirko Nov. 26, 2014, 7:54 a.m. UTC | #23
Tue, Nov 25, 2014 at 11:14:17PM CET, tgraf@suug.ch wrote:
>On 11/25/14 at 11:28am, Jiri Pirko wrote:
>> Do the work of parsing NDA_VLAN directly in rtnetlink code, pass simple
>> u16 vid to drivers from there.
>> 
>> Signed-off-by: Jiri Pirko <jiri@resnulli.us>
>
>I'm slightly confused ;-)
>
>We both argued that parsing Netlink attributes in the drivers is wrong.
>What happened to the plan of renaming ndo_fdb_ to ndo_neigh_ and
>introducing a non-Netlink in-kernel API for advanced usage by swdev?

Well the thing is that at the moment, it is not needed to call ndo_fdb_*
from inside the kernel. So the whole plan does not have to happen now.
Plus I saw the patches (Scott sent me) and I believe that they are
unnecessary complex. We can do this later if needed. Now, I would like
to stick with what we have so far, keep things simple.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jamal Hadi Salim Nov. 26, 2014, 11:28 a.m. UTC | #24
On 11/25/14 22:59, Scott Feldman wrote:
> On Tue, Nov 25, 2014 at 5:19 PM, Jamal Hadi Salim <jhs@mojatatu.com> wrote:
>> On 11/25/14 21:36, Scott Feldman wrote:

>>
>>>> Ok, guess i am gonna have to go stare at the code some more.
>>>> I thought we returned one of the error codes?
>>>> A bitmask would work for a single entry - because you have two
>>>> options add to h/ware and/or s/ware. So response is easy to encode.
>>>> But if i have 1000 and they are sparsely populated (think an indexed
>>>> table and i have indices 1, 23, 45, etc), then a bitmask would be
>>>> hard to use.
>>>
>>>
>>> I'm confused by this discussion.
>>
>>
>> This is about the policy which states "install as many as you can, dont
>> worry about failures". In such a case, how do you tell user space back
>> "oh, btw you know your request #1, #23, and 45 went ok, but nothing else
>> worked". A simple return code wont work. You could return a code to
>> say "some worked". At which case user space could dump and find out only
>> #1, #23 and #45 worked.
>
> You request for what?  That's my confusion.

Scott, you are gonna make do this all over again?;->
The summary is there are three possible policies that could be
identified by the user asking for a kernel operation.
One use case example was to send a bunch of (for example)
create/updates and request that the kernel should not abort on a
failure of a single one but to keep going and create/update as many
as possible. Is that part clear? I know it is not what you do,
but there are use cases for that (Read John's response).
Now assuming someone wants this and some entries failed;
how do you tell user space back what was actually updated vs not?
You could return a code which says "partial success".
Forget whether the table is keyed or indexed but if you wanted
to return more detailed info you would return an array/vector of some
sort with status code per entry. Something netlink cant do.
Is that a better description?

> Are you trying to install
> FDB entry into both SW and HW at same time?


What is wrong with installing on both hardware and software? The
point was to identify what kind of policies could be requested by
the user; but even for the bridge why is it bad that i ask for
both master&self?
It is something I can do today with none of these patches.

> And then do a bunch in a
> batch?  I'm saying use MASTER for SW and SELF for HW in two steps,

But that would be enforcing your policy on me.

> if
> you want FDB entry installed in both Sw and HW.  Check your return
> code each step.  Batch all to HW first, then batch all that PASSED to
> SW.  I don't even know really why you're trying to install to both HW
> and SW.  Install it to HW and be done. fdb_dump will set HW entries
> via SELF.
>

First off: bad performance, but your call to do it that way
(just please please dont enforce it on me;->)

Lets take the hardware batching you mentioned above and see if
i can help to clarify in the third policy choice (continue-on-failure).
Lets say you have a keyed table such as the fdb table is.
You send 10 entries to be created/added in hardware. #3 and #5 failed
because you made a mistake and sent them with the same key. #9 and #10
failed because the hardware doesnt have any more space.
we didnt stop and go back for #3 and #5 because the user told
us to continue and do the rest when we fail. And s/he did that because
she wanted to put as many entries in hardware as possible without
necessarily needing to know how much space exists.


> Ah, Jamal, look again at patches 13-17/17 in last v3 set.  That was a
> big steaming snickerdoodle just for you!  Now you can push policy
> knobs down to port driver and or bridge to fine tune what ever you
> want.  You'll find knobs for learning, flooding, learning sync to hw,
> etc.  I thought you even ACKed some of these.

I think it almost there.
What you are missing is the policy decision to only sync when i
say so. Having an ndo_ops is a necessity but i dont want the driver
to decide for me just because it can ;->
Telling hardware to learn is instructing it to self update its entries
based on source lookup failure. That is distinctly different from
telling to sync to the kernel. So if you add that knob we are in good
shape.

cheers,
jamal

> a) above knob is 14/17
> patch, b) above is using existing learning knob on bridge, c) above I
> don't get...no point in syncing that direction.
>

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jiri Pirko Nov. 26, 2014, 11:40 a.m. UTC | #25
Wed, Nov 26, 2014 at 12:28:18PM CET, jhs@mojatatu.com wrote:
>On 11/25/14 22:59, Scott Feldman wrote:
>>On Tue, Nov 25, 2014 at 5:19 PM, Jamal Hadi Salim <jhs@mojatatu.com> wrote:
>>>On 11/25/14 21:36, Scott Feldman wrote:
>
>>>
>>>>>Ok, guess i am gonna have to go stare at the code some more.
>>>>>I thought we returned one of the error codes?
>>>>>A bitmask would work for a single entry - because you have two
>>>>>options add to h/ware and/or s/ware. So response is easy to encode.
>>>>>But if i have 1000 and they are sparsely populated (think an indexed
>>>>>table and i have indices 1, 23, 45, etc), then a bitmask would be
>>>>>hard to use.
>>>>
>>>>
>>>>I'm confused by this discussion.
>>>
>>>
>>>This is about the policy which states "install as many as you can, dont
>>>worry about failures". In such a case, how do you tell user space back
>>>"oh, btw you know your request #1, #23, and 45 went ok, but nothing else
>>>worked". A simple return code wont work. You could return a code to
>>>say "some worked". At which case user space could dump and find out only
>>>#1, #23 and #45 worked.
>>
>>You request for what?  That's my confusion.
>
>Scott, you are gonna make do this all over again?;->
>The summary is there are three possible policies that could be
>identified by the user asking for a kernel operation.
>One use case example was to send a bunch of (for example)
>create/updates and request that the kernel should not abort on a
>failure of a single one but to keep going and create/update as many
>as possible. Is that part clear? I know it is not what you do,
>but there are use cases for that (Read John's response).
>Now assuming someone wants this and some entries failed;
>how do you tell user space back what was actually updated vs not?
>You could return a code which says "partial success".
>Forget whether the table is keyed or indexed but if you wanted
>to return more detailed info you would return an array/vector of some
>sort with status code per entry. Something netlink cant do.
>Is that a better description?

Sure this is something that is reasonable to request. But that would
require a major changes to userspace api. At this moment, when we are
using the existing api, I would leave this out for phase 1. Let this be
resolved later as a separate work. Does that make sense?


>
>>Are you trying to install
>>FDB entry into both SW and HW at same time?
>
>
>What is wrong with installing on both hardware and software? The
>point was to identify what kind of policies could be requested by
>the user; but even for the bridge why is it bad that i ask for
>both master&self?
>It is something I can do today with none of these patches.
>
>>And then do a bunch in a
>>batch?  I'm saying use MASTER for SW and SELF for HW in two steps,
>
>But that would be enforcing your policy on me.
>
>>if
>>you want FDB entry installed in both Sw and HW.  Check your return
>>code each step.  Batch all to HW first, then batch all that PASSED to
>>SW.  I don't even know really why you're trying to install to both HW
>>and SW.  Install it to HW and be done. fdb_dump will set HW entries
>>via SELF.
>>
>
>First off: bad performance, but your call to do it that way
>(just please please dont enforce it on me;->)
>
>Lets take the hardware batching you mentioned above and see if
>i can help to clarify in the third policy choice (continue-on-failure).
>Lets say you have a keyed table such as the fdb table is.
>You send 10 entries to be created/added in hardware. #3 and #5 failed
>because you made a mistake and sent them with the same key. #9 and #10
>failed because the hardware doesnt have any more space.
>we didnt stop and go back for #3 and #5 because the user told
>us to continue and do the rest when we fail. And s/he did that because
>she wanted to put as many entries in hardware as possible without
>necessarily needing to know how much space exists.
>
>
>>Ah, Jamal, look again at patches 13-17/17 in last v3 set.  That was a
>>big steaming snickerdoodle just for you!  Now you can push policy
>>knobs down to port driver and or bridge to fine tune what ever you
>>want.  You'll find knobs for learning, flooding, learning sync to hw,
>>etc.  I thought you even ACKed some of these.
>
>I think it almost there.
>What you are missing is the policy decision to only sync when i
>say so. Having an ndo_ops is a necessity but i dont want the driver
>to decide for me just because it can ;->
>Telling hardware to learn is instructing it to self update its entries
>based on source lookup failure. That is distinctly different from
>telling to sync to the kernel. So if you add that knob we are in good
>shape.
>
>cheers,
>jamal
>
>>a) above knob is 14/17
>>patch, b) above is using existing learning knob on bridge, c) above I
>>don't get...no point in syncing that direction.
>>
>
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jamal Hadi Salim Nov. 26, 2014, 11:54 a.m. UTC | #26
On 11/26/14 06:40, Jiri Pirko wrote:
> Wed, Nov 26, 2014 at 12:28:18PM CET, jhs@mojatatu.com wrote:

>> Scott, you are gonna make do this all over again?;->
>> The summary is there are three possible policies that could be
>> identified by the user asking for a kernel operation.
>> One use case example was to send a bunch of (for example)
>> create/updates and request that the kernel should not abort on a
>> failure of a single one but to keep going and create/update as many
>> as possible. Is that part clear? I know it is not what you do,
>> but there are use cases for that (Read John's response).
>> Now assuming someone wants this and some entries failed;
>> how do you tell user space back what was actually updated vs not?
>> You could return a code which says "partial success".
>> Forget whether the table is keyed or indexed but if you wanted
>> to return more detailed info you would return an array/vector of some
>> sort with status code per entry. Something netlink cant do.
>> Is that a better description?
>
> Sure this is something that is reasonable to request. But that would
> require a major changes to userspace api. At this moment, when we are
> using the existing api, I would leave this out for phase 1. Let this be
> resolved later as a separate work. Does that make sense?
>

I think these are just discussions so we know where we are going.
I ACKed the patch already but added that we should consider these
policies. Scott take note.

The default behavior should be maintained whatever the new policies are.
The vectoring is going to be a harder thing to get right. It can be done
but long shot probably.
For user->kernel policy description, that is easy; we need 2 bits
from somewhere; probably same namespace as software/hardware.


cheers,
jamal


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jamal Hadi Salim Nov. 26, 2014, 12:06 p.m. UTC | #27
On 11/26/14 06:54, Jamal Hadi Salim wrote:
> On 11/26/14 06:40, Jiri Pirko wrote:

>> Sure this is something that is reasonable to request. But that would
>> require a major changes to userspace api. At this moment, when we are
>> using the existing api, I would leave this out for phase 1. Let this be
>> resolved later as a separate work. Does that make sense?
>>
>
> I think these are just discussions so we know where we are going.
> I ACKed the patch already but added that we should consider these
> policies. Scott take note.
>

In case i wasnt clear - yes, the patch as is fine ;->

cheers,
jamal
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Scott Feldman Nov. 27, 2014, 6:50 a.m. UTC | #28
On Wed, Nov 26, 2014 at 1:28 AM, Jamal Hadi Salim <jhs@mojatatu.com> wrote:
> On 11/25/14 22:59, Scott Feldman wrote:
>>
>> On Tue, Nov 25, 2014 at 5:19 PM, Jamal Hadi Salim <jhs@mojatatu.com>
>> wrote:
>>>
>>> On 11/25/14 21:36, Scott Feldman wrote:
>
>
>>>
>>>>> Ok, guess i am gonna have to go stare at the code some more.
>>>>> I thought we returned one of the error codes?
>>>>> A bitmask would work for a single entry - because you have two
>>>>> options add to h/ware and/or s/ware. So response is easy to encode.
>>>>> But if i have 1000 and they are sparsely populated (think an indexed
>>>>> table and i have indices 1, 23, 45, etc), then a bitmask would be
>>>>> hard to use.
>>>>
>>>>
>>>>
>>>> I'm confused by this discussion.
>>>
>>>
>>>
>>> This is about the policy which states "install as many as you can, dont
>>> worry about failures". In such a case, how do you tell user space back
>>> "oh, btw you know your request #1, #23, and 45 went ok, but nothing else
>>> worked". A simple return code wont work. You could return a code to
>>> say "some worked". At which case user space could dump and find out only
>>> #1, #23 and #45 worked.
>>
>>
>> You request for what?  That's my confusion.
>
>
> Scott, you are gonna make do this all over again?;->
> The summary is there are three possible policies that could be
> identified by the user asking for a kernel operation.
> One use case example was to send a bunch of (for example)
> create/updates and request that the kernel should not abort on a
> failure of a single one but to keep going and create/update as many
> as possible. Is that part clear? I know it is not what you do,
> but there are use cases for that (Read John's response).
> Now assuming someone wants this and some entries failed;
> how do you tell user space back what was actually updated vs not?
> You could return a code which says "partial success".
> Forget whether the table is keyed or indexed but if you wanted
> to return more detailed info you would return an array/vector of some
> sort with status code per entry. Something netlink cant do.
> Is that a better description?
>
>> Are you trying to install
>> FDB entry into both SW and HW at same time?
>
>
>
> What is wrong with installing on both hardware and software? The
> point was to identify what kind of policies could be requested by
> the user; but even for the bridge why is it bad that i ask for
> both master&self?
> It is something I can do today with none of these patches.
>
>> And then do a bunch in a
>> batch?  I'm saying use MASTER for SW and SELF for HW in two steps,
>
>
> But that would be enforcing your policy on me.

Ok, I get it now.  I'm looking forward to see what solution people
come up with to solve this.

>
>> if
>> you want FDB entry installed in both Sw and HW.  Check your return
>> code each step.  Batch all to HW first, then batch all that PASSED to
>> SW.  I don't even know really why you're trying to install to both HW
>> and SW.  Install it to HW and be done. fdb_dump will set HW entries
>> via SELF.
>>
>
> First off: bad performance, but your call to do it that way
> (just please please dont enforce it on me;->)
>
> Lets take the hardware batching you mentioned above and see if
> i can help to clarify in the third policy choice (continue-on-failure).
> Lets say you have a keyed table such as the fdb table is.
> You send 10 entries to be created/added in hardware. #3 and #5 failed
> because you made a mistake and sent them with the same key. #9 and #10
> failed because the hardware doesnt have any more space.
> we didnt stop and go back for #3 and #5 because the user told
> us to continue and do the rest when we fail. And s/he did that because
> she wanted to put as many entries in hardware as possible without
> necessarily needing to know how much space exists.
>
>
>> Ah, Jamal, look again at patches 13-17/17 in last v3 set.  That was a
>> big steaming snickerdoodle just for you!  Now you can push policy
>> knobs down to port driver and or bridge to fine tune what ever you
>> want.  You'll find knobs for learning, flooding, learning sync to hw,
>> etc.  I thought you even ACKed some of these.
>
>
> I think it almost there.
> What you are missing is the policy decision to only sync when i
> say so. Having an ndo_ops is a necessity but i dont want the driver
> to decide for me just because it can ;->
> Telling hardware to learn is instructing it to self update its entries
> based on source lookup failure. That is distinctly different from
> telling to sync to the kernel. So if you add that knob we are in good
> shape.

It's there: IFLA_BRPORT_LEARNING_SYNC.  From iproute2:

$ bridge -d link show dev swp1
2: swp1 state UNKNOWN : <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500
master br0 state forwarding priority 32 cost 2
    hairpin off guard off root_block off fastleave off learning off flood off
2: swp1 state UNKNOWN : <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 master br0
    learning on learning_sync on hwmode swdev

Turn it off:

$ bridge link set dev swp1 hwmode swdev learning_sync off

And now:

$ bridge -d link show dev swp1
2: swp1 state UNKNOWN : <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500
master br0 state forwarding priority 32 cost 2
    hairpin off guard off root_block off fastleave off learning off flood off
2: swp1 state UNKNOWN : <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 master br0
    learning on learning_sync off hwmode swdev


> cheers,
> jamal
>
>
>> a) above knob is 14/17
>> patch, b) above is using existing learning knob on bridge, c) above I
>> don't get...no point in syncing that direction.
>>
>
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jamal Hadi Salim Nov. 27, 2014, 12:14 p.m. UTC | #29
On 11/27/14 01:50, Scott Feldman wrote:

[..]

>
> It's there: IFLA_BRPORT_LEARNING_SYNC.  From iproute2:
>
> $ bridge -d link show dev swp1
> 2: swp1 state UNKNOWN : <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500
> master br0 state forwarding priority 32 cost 2
>      hairpin off guard off root_block off fastleave off learning off flood off
> 2: swp1 state UNKNOWN : <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 master br0
>      learning on learning_sync on hwmode swdev
>
> Turn it off:
>
> $ bridge link set dev swp1 hwmode swdev learning_sync off
>
> And now:
>
> $ bridge -d link show dev swp1
> 2: swp1 state UNKNOWN : <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500
> master br0 state forwarding priority 32 cost 2
>      hairpin off guard off root_block off fastleave off learning off flood off
> 2: swp1 state UNKNOWN : <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 master br0
>      learning on learning_sync off hwmode swdev
>
>

Yes, this is the nice control portion.
 From reviewing the patches, I didnt see how the core to the driver was
using the  learning_sync. IOW, how do i turn off the drivers sync
from being activated? Maybe you are doing this in the rocker patches
which i didnt review? i think this needs to be core infrastructure i.e
if you are doing this in a timer (as opposed to interrupt driven), then
the core sync timer would kick in and call some driver ops.
In any case, details that can be ironed out later..

cheers,
jamal

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Scott Feldman Nov. 27, 2014, 8:59 p.m. UTC | #30
Ya right now the driver just doesn't call br_fdb_external_learn_add()
if LEARNING_SYNC is not set.  It's a port driver setting so it seems
fine to handle it in the port driver.  You could move the check up to
br_fdb_external_learn_add(), but then you have an extra call every 1s
for each fdb entry being refreshed.  (1s or whatever the refresh
frequency is).  Easier to avoid this overhead and make the decision at
the source.

-scott

On Thu, Nov 27, 2014 at 2:14 AM, Jamal Hadi Salim <jhs@mojatatu.com> wrote:
> On 11/27/14 01:50, Scott Feldman wrote:
>
> [..]
>
>>
>> It's there: IFLA_BRPORT_LEARNING_SYNC.  From iproute2:
>>
>> $ bridge -d link show dev swp1
>> 2: swp1 state UNKNOWN : <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500
>> master br0 state forwarding priority 32 cost 2
>>      hairpin off guard off root_block off fastleave off learning off flood
>> off
>> 2: swp1 state UNKNOWN : <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 master
>> br0
>>      learning on learning_sync on hwmode swdev
>>
>> Turn it off:
>>
>> $ bridge link set dev swp1 hwmode swdev learning_sync off
>>
>> And now:
>>
>> $ bridge -d link show dev swp1
>> 2: swp1 state UNKNOWN : <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500
>> master br0 state forwarding priority 32 cost 2
>>      hairpin off guard off root_block off fastleave off learning off flood
>> off
>> 2: swp1 state UNKNOWN : <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 master
>> br0
>>      learning on learning_sync off hwmode swdev
>>
>>
>
> Yes, this is the nice control portion.
> From reviewing the patches, I didnt see how the core to the driver was
> using the  learning_sync. IOW, how do i turn off the drivers sync
> from being activated? Maybe you are doing this in the rocker patches
> which i didnt review? i think this needs to be core infrastructure i.e
> if you are doing this in a timer (as opposed to interrupt driven), then
> the core sync timer would kick in and call some driver ops.
> In any case, details that can be ironed out later..
>
> cheers,
> jamal
>
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jiri Pirko Nov. 27, 2014, 9:55 p.m. UTC | #31
Thu, Nov 27, 2014 at 09:59:37PM CET, sfeldma@gmail.com wrote:
>Ya right now the driver just doesn't call br_fdb_external_learn_add()
>if LEARNING_SYNC is not set.  It's a port driver setting so it seems
>fine to handle it in the port driver.  You could move the check up to
>br_fdb_external_learn_add(), but then you have an extra call every 1s
>for each fdb entry being refreshed.  (1s or whatever the refresh
>frequency is).  Easier to avoid this overhead and make the decision at
>the source.

I have been thinking about moving the check into bridge code, it to make
it there as well as in drivers. This is easily changeable on demenad
later though, so I left this for now.

>
>-scott
>
>On Thu, Nov 27, 2014 at 2:14 AM, Jamal Hadi Salim <jhs@mojatatu.com> wrote:
>> On 11/27/14 01:50, Scott Feldman wrote:
>>
>> [..]
>>
>>>
>>> It's there: IFLA_BRPORT_LEARNING_SYNC.  From iproute2:
>>>
>>> $ bridge -d link show dev swp1
>>> 2: swp1 state UNKNOWN : <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500
>>> master br0 state forwarding priority 32 cost 2
>>>      hairpin off guard off root_block off fastleave off learning off flood
>>> off
>>> 2: swp1 state UNKNOWN : <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 master
>>> br0
>>>      learning on learning_sync on hwmode swdev
>>>
>>> Turn it off:
>>>
>>> $ bridge link set dev swp1 hwmode swdev learning_sync off
>>>
>>> And now:
>>>
>>> $ bridge -d link show dev swp1
>>> 2: swp1 state UNKNOWN : <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500
>>> master br0 state forwarding priority 32 cost 2
>>>      hairpin off guard off root_block off fastleave off learning off flood
>>> off
>>> 2: swp1 state UNKNOWN : <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 master
>>> br0
>>>      learning on learning_sync off hwmode swdev
>>>
>>>
>>
>> Yes, this is the nice control portion.
>> From reviewing the patches, I didnt see how the core to the driver was
>> using the  learning_sync. IOW, how do i turn off the drivers sync
>> from being activated? Maybe you are doing this in the rocker patches
>> which i didnt review? i think this needs to be core infrastructure i.e
>> if you are doing this in a timer (as opposed to interrupt driven), then
>> the core sync timer would kick in and call some driver ops.
>> In any case, details that can be ironed out later..
>>
>> cheers,
>> jamal
>>
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Roopa Prabhu Nov. 28, 2014, 10:14 a.m. UTC | #32
On 11/25/14, 6:36 PM, Scott Feldman wrote:
> On Tue, Nov 25, 2014 at 6:50 AM, Jamal Hadi Salim <jhs@mojatatu.com> wrote:
>> On 11/25/14 11:30, John Fastabend wrote:
>>> On 11/25/2014 08:18 AM, Jamal Hadi Salim wrote:
>>>> On 11/25/14 11:01, John Fastabend wrote:
>>>>> On 11/25/2014 07:38 AM, Jamal Hadi Salim wrote:
>>>>>> On 11/25/14 05:28, Jiri Pirko wrote:
>>>>>>> Do the work of parsing NDA_VLAN directly in rtnetlink code, pass
>>>>>>> simple
>>>>>>> u16 vid to drivers from there.
>>>>>>>
>>
>>> Actually (after having some coffee) this becomes much more useful
>>> if you return which items failed. Then you can slam the hardware
>>> with your 100 entries, probably a lot more then that, and come back
>>> later and clean it up.
>>>
>> Yes, that is the general use case.
>> Unfortunately at the moment we only return codes on a netlink set
>> direction - but would be a beauty if we could return what succeeded
>> and didnt in some form of vector.
>> Note: all is not lost because you can always do a get afterwards and
>> find what is missing if you got a return code of "partial success".
>> Just a little less efficient..
>>
>>
>>> We return a bitmask of which operations were successful. So if SW fails
>>> we have both bits cleared and we abort. When SW is successful we set the
>>> SW bit and try to program the HW. If its sucessful we set the HW bit if
>>> its not we abort with an err. Converting this to (1) is not much work
>>> just skip the abort.
>>>
>> Ok, guess i am gonna have to go stare at the code some more.
>> I thought we returned one of the error codes?
>> A bitmask would work for a single entry - because you have two
>> options add to h/ware and/or s/ware. So response is easy to encode.
>> But if i have 1000 and they are sparsely populated (think an indexed
>> table and i have indices 1, 23, 45, etc), then a bitmask would be
>> hard to use.
> I'm confused by this discussion.  Do I have this right: You want to
> send 1000 RTM_NEWNEIGHs to PF_BRIDGE with both NTF_MASTER and NTF_SELF
> set such that 1000 new FBD entries are installed in both (SW) the
> bridge's FDB and (HW) the port driver's FDB.  My first confusion is
> why do you want these FBD entries in bridge's FDB?  We're offloading
> the switching to HW so HW should be handling fwd plane.  If ctrl pkt
> make it to SW, it can learn those FDB entries; no need for manual
> install of FDB entry in SW.  It seems to me you only want to use
> NTF_SELF to install the FDB entry in HW using the port driver.  And an
> error code is returned for that install.  Since there is only one
> target (NTF_SELF) there is no need for bitmask return.
>
scott, we do have such usecase today. ie , a fdb entry with both 
NTF_MASTER and NTF_SELF set.
And these fdb entries can come from an external controller. The path to 
get them to the hw is via the kernel.
The controller can use `bridge fdb add` to add the fdb entries to the 
kernel (with NTF_MASTER) and also indicate in the same message to add 
the fdb entry to hw (with NTF_SELF). And in this model it is assumed 
that the kernel fdb and hw fdb are in sync.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Scott Feldman Nov. 28, 2014, 10:33 a.m. UTC | #33
On Fri, Nov 28, 2014 at 2:14 AM, Roopa Prabhu <roopa@cumulusnetworks.com> wrote:
> On 11/25/14, 6:36 PM, Scott Feldman wrote:
>>
>> On Tue, Nov 25, 2014 at 6:50 AM, Jamal Hadi Salim <jhs@mojatatu.com>
>> wrote:
>>>
>>> On 11/25/14 11:30, John Fastabend wrote:
>>>>
>>>> On 11/25/2014 08:18 AM, Jamal Hadi Salim wrote:
>>>>>
>>>>> On 11/25/14 11:01, John Fastabend wrote:
>>>>>>
>>>>>> On 11/25/2014 07:38 AM, Jamal Hadi Salim wrote:
>>>>>>>
>>>>>>> On 11/25/14 05:28, Jiri Pirko wrote:
>>>>>>>>
>>>>>>>> Do the work of parsing NDA_VLAN directly in rtnetlink code, pass
>>>>>>>> simple
>>>>>>>> u16 vid to drivers from there.
>>>>>>>>
>>>
>>>> Actually (after having some coffee) this becomes much more useful
>>>> if you return which items failed. Then you can slam the hardware
>>>> with your 100 entries, probably a lot more then that, and come back
>>>> later and clean it up.
>>>>
>>> Yes, that is the general use case.
>>> Unfortunately at the moment we only return codes on a netlink set
>>> direction - but would be a beauty if we could return what succeeded
>>> and didnt in some form of vector.
>>> Note: all is not lost because you can always do a get afterwards and
>>> find what is missing if you got a return code of "partial success".
>>> Just a little less efficient..
>>>
>>>
>>>> We return a bitmask of which operations were successful. So if SW fails
>>>> we have both bits cleared and we abort. When SW is successful we set the
>>>> SW bit and try to program the HW. If its sucessful we set the HW bit if
>>>> its not we abort with an err. Converting this to (1) is not much work
>>>> just skip the abort.
>>>>
>>> Ok, guess i am gonna have to go stare at the code some more.
>>> I thought we returned one of the error codes?
>>> A bitmask would work for a single entry - because you have two
>>> options add to h/ware and/or s/ware. So response is easy to encode.
>>> But if i have 1000 and they are sparsely populated (think an indexed
>>> table and i have indices 1, 23, 45, etc), then a bitmask would be
>>> hard to use.
>>
>> I'm confused by this discussion.  Do I have this right: You want to
>> send 1000 RTM_NEWNEIGHs to PF_BRIDGE with both NTF_MASTER and NTF_SELF
>> set such that 1000 new FBD entries are installed in both (SW) the
>> bridge's FDB and (HW) the port driver's FDB.  My first confusion is
>> why do you want these FBD entries in bridge's FDB?  We're offloading
>> the switching to HW so HW should be handling fwd plane.  If ctrl pkt
>> make it to SW, it can learn those FDB entries; no need for manual
>> install of FDB entry in SW.  It seems to me you only want to use
>> NTF_SELF to install the FDB entry in HW using the port driver.  And an
>> error code is returned for that install.  Since there is only one
>> target (NTF_SELF) there is no need for bitmask return.
>>
> scott, we do have such usecase today. ie , a fdb entry with both NTF_MASTER
> and NTF_SELF set.
> And these fdb entries can come from an external controller. The path to get
> them to the hw is via the kernel.
> The controller can use `bridge fdb add` to add the fdb entries to the kernel
> (with NTF_MASTER) and also indicate in the same message to add the fdb entry
> to hw (with NTF_SELF). And in this model it is assumed that the kernel fdb
> and hw fdb are in sync.

Ya, I understood that from Jamal's explanation.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jamal Hadi Salim Nov. 28, 2014, 12:57 p.m. UTC | #34
On 11/27/14 16:55, Jiri Pirko wrote:
> Thu, Nov 27, 2014 at 09:59:37PM CET, sfeldma@gmail.com wrote:
>> Ya right now the driver just doesn't call br_fdb_external_learn_add()
>> if LEARNING_SYNC is not set.  It's a port driver setting so it seems
>> fine to handle it in the port driver.  You could move the check up to
>> br_fdb_external_learn_add(), but then you have an extra call every 1s
>> for each fdb entry being refreshed.  (1s or whatever the refresh
>> frequency is).  Easier to avoid this overhead and make the decision at
>> the source.
>
> I have been thinking about moving the check into bridge code, it to make
> it there as well as in drivers. This is easily changeable on demenad
> later though, so I left this for now.
>

It seems more comfortable to move to the core if you are doing polling
with timers... i.e you kick it per-offload via some timer. The arming
being done by the setting of LEARNING_SYNC
There are cases where all this is done via interrupts. i.e the hardware
will issue an interrupt only then do you poll..

Maybe Scott's approach is the correct one. I am indifferent to be
honest, these are some of those things that can be easily refactored
as more hardware shows up.

cheers,
jamal


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Or Gerlitz Dec. 9, 2014, 11:57 a.m. UTC | #35
On Tue, Nov 25, 2014 at 5:43 PM, John Fastabend
<john.fastabend@gmail.com> wrote:
> On 11/25/2014 07:18 AM, Jiri Pirko wrote:
>>
>> Tue, Nov 25, 2014 at 04:13:12PM CET, gospo@cumulusnetworks.com wrote:
>>>
>>> On Tue, Nov 25, 2014 at 11:28:33AM +0100, Jiri Pirko wrote:
>>>>
>>>> Do the work of parsing NDA_VLAN directly in rtnetlink code, pass simple
>>>> u16 vid to drivers from there.
>>>>
>>>> Signed-off-by: Jiri Pirko <jiri@resnulli.us>
>>>
>>>
>>> Structurally this looks fine, just a misspelling noted below.
>>>
>>> Acked-by: Andy Gospodarek <gospo@cumulusnetworks.com>
>>>
>
> If your going to spin this, should we return an error from
> ndo_dflt_fdb_add() when we have a non-zero vid? The dflt
> handler uses the dev_(mc|uc)_add_excl routines which will
> not consume vids.

so... was this comment addressed along the discussion? I see in the
code that we don't check
on the _dflt_ handlers nor on the per device ones (ixgbe, i40e, qlgc)
for a valid VID and return
error on that.


> If you want to address this with a follow up patch I'm OK
> with that. Go ahead and add my ack,
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 7262077..5ed5e40 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -7536,7 +7536,7 @@  static int i40e_get_phys_port_id(struct net_device *netdev,
  */
 static int i40e_ndo_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
 			    struct net_device *dev,
-			    const unsigned char *addr,
+			    const unsigned char *addr, u16 vid,
 			    u16 flags)
 {
 	struct i40e_netdev_priv *np = netdev_priv(dev);
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 932f779..1bad9f4 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -7708,7 +7708,7 @@  static int ixgbe_set_features(struct net_device *netdev,
 
 static int ixgbe_ndo_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
 			     struct net_device *dev,
-			     const unsigned char *addr,
+			     const unsigned char *addr, u16 vid,
 			     u16 flags)
 {
 	/* guarantee we can provide a unique filter for the unicast address */
@@ -7717,7 +7717,7 @@  static int ixgbe_ndo_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
 			return -ENOMEM;
 	}
 
-	return ndo_dflt_fdb_add(ndm, tb, dev, addr, flags);
+	return ndo_dflt_fdb_add(ndm, tb, dev, addr, vid, flags);
 }
 
 static int ixgbe_ndo_bridge_setlink(struct net_device *dev,
diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c
index a913b3a..3227c80 100644
--- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c
+++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c
@@ -376,13 +376,14 @@  static int qlcnic_set_mac(struct net_device *netdev, void *p)
 }
 
 static int qlcnic_fdb_del(struct ndmsg *ndm, struct nlattr *tb[],
-			struct net_device *netdev, const unsigned char *addr)
+			struct net_device *netdev,
+			const unsigned char *addr, u16 vid)
 {
 	struct qlcnic_adapter *adapter = netdev_priv(netdev);
 	int err = -EOPNOTSUPP;
 
 	if (!adapter->fdb_mac_learn)
-		return ndo_dflt_fdb_del(ndm, tb, netdev, addr);
+		return ndo_dflt_fdb_del(ndm, tb, netdev, addr, vid);
 
 	if ((adapter->flags & QLCNIC_ESWITCH_ENABLED) ||
 	    qlcnic_sriov_check(adapter)) {
@@ -401,13 +402,13 @@  static int qlcnic_fdb_del(struct ndmsg *ndm, struct nlattr *tb[],
 
 static int qlcnic_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
 			struct net_device *netdev,
-			const unsigned char *addr, u16 flags)
+			const unsigned char *addr, u16 vid, u16 flags)
 {
 	struct qlcnic_adapter *adapter = netdev_priv(netdev);
 	int err = 0;
 
 	if (!adapter->fdb_mac_learn)
-		return ndo_dflt_fdb_add(ndm, tb, netdev, addr, flags);
+		return ndo_dflt_fdb_add(ndm, tb, netdev, addr, vid, flags);
 
 	if (!(adapter->flags & QLCNIC_ESWITCH_ENABLED) &&
 	    !qlcnic_sriov_check(adapter)) {
diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index bfb0b6e..a1a3e3e 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -872,7 +872,7 @@  static int macvlan_vlan_rx_kill_vid(struct net_device *dev,
 
 static int macvlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
 			   struct net_device *dev,
-			   const unsigned char *addr,
+			   const unsigned char *addr, u16 vid,
 			   u16 flags)
 {
 	struct macvlan_dev *vlan = netdev_priv(dev);
@@ -897,7 +897,7 @@  static int macvlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
 
 static int macvlan_fdb_del(struct ndmsg *ndm, struct nlattr *tb[],
 			   struct net_device *dev,
-			   const unsigned char *addr)
+			   const unsigned char *addr, u16 vid)
 {
 	struct macvlan_dev *vlan = netdev_priv(dev);
 	int err = -EINVAL;
diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index e9f81d4..7d8013d 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -849,7 +849,7 @@  static int vxlan_fdb_parse(struct nlattr *tb[], struct vxlan_dev *vxlan,
 /* Add static entry (via netlink) */
 static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
 			 struct net_device *dev,
-			 const unsigned char *addr, u16 flags)
+			 const unsigned char *addr, u16 vid, u16 flags)
 {
 	struct vxlan_dev *vxlan = netdev_priv(dev);
 	/* struct net *net = dev_net(vxlan->dev); */
@@ -885,7 +885,7 @@  static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
 /* Delete entry (via netlink) */
 static int vxlan_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[],
 			    struct net_device *dev,
-			    const unsigned char *addr)
+			    const unsigned char *addr, u16 vid)
 {
 	struct vxlan_dev *vxlan = netdev_priv(dev);
 	struct vxlan_fdb *f;
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 5cd5087..fab074e 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -951,11 +951,11 @@  typedef u16 (*select_queue_fallback_t)(struct net_device *dev,
  *
  * int (*ndo_fdb_add)(struct ndmsg *ndm, struct nlattr *tb[],
  *		      struct net_device *dev,
- *		      const unsigned char *addr, u16 flags)
+ *		      const unsigned char *addr, u16 vid, u16 flags)
  *	Adds an FDB entry to dev for addr.
  * int (*ndo_fdb_del)(struct ndmsg *ndm, struct nlattr *tb[],
  *		      struct net_device *dev,
- *		      const unsigned char *addr)
+ *		      const unsigned char *addr, u16 vid)
  *	Deletes the FDB entry from dev coresponding to addr.
  * int (*ndo_fdb_dump)(struct sk_buff *skb, struct netlink_callback *cb,
  *		       struct net_device *dev, struct net_device *filter_dev,
@@ -1128,11 +1128,13 @@  struct net_device_ops {
 					       struct nlattr *tb[],
 					       struct net_device *dev,
 					       const unsigned char *addr,
+					       u16 vid,
 					       u16 flags);
 	int			(*ndo_fdb_del)(struct ndmsg *ndm,
 					       struct nlattr *tb[],
 					       struct net_device *dev,
-					       const unsigned char *addr);
+					       const unsigned char *addr,
+					       u16 vid);
 	int			(*ndo_fdb_dump)(struct sk_buff *skb,
 						struct netlink_callback *cb,
 						struct net_device *dev,
diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index 6cacbce..063f0f5 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -94,11 +94,13 @@  extern int ndo_dflt_fdb_add(struct ndmsg *ndm,
 			    struct nlattr *tb[],
 			    struct net_device *dev,
 			    const unsigned char *addr,
-			     u16 flags);
+			    u16 vid,
+			    u16 flags);
 extern int ndo_dflt_fdb_del(struct ndmsg *ndm,
 			    struct nlattr *tb[],
 			    struct net_device *dev,
-			    const unsigned char *addr);
+			    const unsigned char *addr,
+			    u16 vid);
 
 extern int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq,
 				   struct net_device *dev, u16 mode);
diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index 08ef4e7..b1be971 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -805,33 +805,17 @@  static int __br_fdb_add(struct ndmsg *ndm, struct net_bridge_port *p,
 /* Add new permanent fdb entry with RTM_NEWNEIGH */
 int br_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
 	       struct net_device *dev,
-	       const unsigned char *addr, u16 nlh_flags)
+	       const unsigned char *addr, u16 vid, u16 nlh_flags)
 {
 	struct net_bridge_port *p;
 	int err = 0;
 	struct net_port_vlans *pv;
-	unsigned short vid = VLAN_N_VID;
 
 	if (!(ndm->ndm_state & (NUD_PERMANENT|NUD_NOARP|NUD_REACHABLE))) {
 		pr_info("bridge: RTM_NEWNEIGH with invalid state %#x\n", ndm->ndm_state);
 		return -EINVAL;
 	}
 
-	if (tb[NDA_VLAN]) {
-		if (nla_len(tb[NDA_VLAN]) != sizeof(unsigned short)) {
-			pr_info("bridge: RTM_NEWNEIGH with invalid vlan\n");
-			return -EINVAL;
-		}
-
-		vid = nla_get_u16(tb[NDA_VLAN]);
-
-		if (!vid || vid >= VLAN_VID_MASK) {
-			pr_info("bridge: RTM_NEWNEIGH with invalid vlan id %d\n",
-				vid);
-			return -EINVAL;
-		}
-	}
-
 	if (is_zero_ether_addr(addr)) {
 		pr_info("bridge: RTM_NEWNEIGH with invalid ether address\n");
 		return -EINVAL;
@@ -845,7 +829,7 @@  int br_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
 	}
 
 	pv = nbp_get_vlan_info(p);
-	if (vid != VLAN_N_VID) {
+	if (vid) {
 		if (!pv || !test_bit(vid, pv->vlan_bitmap)) {
 			pr_info("bridge: RTM_NEWNEIGH with unconfigured "
 				"vlan %d on port %s\n", vid, dev->name);
@@ -903,27 +887,12 @@  static int __br_fdb_delete(struct net_bridge_port *p,
 /* Remove neighbor entry with RTM_DELNEIGH */
 int br_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[],
 		  struct net_device *dev,
-		  const unsigned char *addr)
+		  const unsigned char *addr, u16 vid)
 {
 	struct net_bridge_port *p;
 	int err;
 	struct net_port_vlans *pv;
-	unsigned short vid = VLAN_N_VID;
-
-	if (tb[NDA_VLAN]) {
-		if (nla_len(tb[NDA_VLAN]) != sizeof(unsigned short)) {
-			pr_info("bridge: RTM_NEWNEIGH with invalid vlan\n");
-			return -EINVAL;
-		}
 
-		vid = nla_get_u16(tb[NDA_VLAN]);
-
-		if (!vid || vid >= VLAN_VID_MASK) {
-			pr_info("bridge: RTM_NEWNEIGH with invalid vlan id %d\n",
-				vid);
-			return -EINVAL;
-		}
-	}
 	p = br_port_get_rtnl(dev);
 	if (p == NULL) {
 		pr_info("bridge: RTM_DELNEIGH %s not a bridge port\n",
@@ -932,7 +901,7 @@  int br_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[],
 	}
 
 	pv = nbp_get_vlan_info(p);
-	if (vid != VLAN_N_VID) {
+	if (vid) {
 		if (!pv || !test_bit(vid, pv->vlan_bitmap)) {
 			pr_info("bridge: RTM_DELNEIGH with unconfigured "
 				"vlan %d on port %s\n", vid, dev->name);
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 8f3f081..4f577c4 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -404,9 +404,9 @@  void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source,
 		   const unsigned char *addr, u16 vid, bool added_by_user);
 
 int br_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[],
-		  struct net_device *dev, const unsigned char *addr);
+		  struct net_device *dev, const unsigned char *addr, u16 vid);
 int br_fdb_add(struct ndmsg *nlh, struct nlattr *tb[], struct net_device *dev,
-	       const unsigned char *addr, u16 nlh_flags);
+	       const unsigned char *addr, u16 vid, u16 nlh_flags);
 int br_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb,
 		struct net_device *dev, struct net_device *fdev, int idx);
 int br_fdb_sync_static(struct net_bridge *br, struct net_bridge_port *p);
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index a688268..f2a4b38 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -36,6 +36,7 @@ 
 #include <linux/mutex.h>
 #include <linux/if_addr.h>
 #include <linux/if_bridge.h>
+#include <linux/if_vlan.h>
 #include <linux/pci.h>
 #include <linux/etherdevice.h>
 
@@ -2312,7 +2313,7 @@  errout:
 int ndo_dflt_fdb_add(struct ndmsg *ndm,
 		     struct nlattr *tb[],
 		     struct net_device *dev,
-		     const unsigned char *addr,
+		     const unsigned char *addr, u16 vid,
 		     u16 flags)
 {
 	int err = -EINVAL;
@@ -2338,6 +2339,28 @@  int ndo_dflt_fdb_add(struct ndmsg *ndm,
 }
 EXPORT_SYMBOL(ndo_dflt_fdb_add);
 
+static int fbd_vid_parse(struct nlattr *vlan_attr, u16 *p_vid)
+{
+	u16 vid = 0;
+
+	if (vlan_attr) {
+		if (nla_len(vlan_attr) != sizeof(u16)) {
+			pr_info("PF_BRIDGE: RTM_NEWNEIGH with invalid vlan\n");
+			return -EINVAL;
+		}
+
+		vid = nla_get_u16(vlan_attr);
+
+		if (!vid || vid >= VLAN_VID_MASK) {
+			pr_info("PF_BRIDGE: RTM_NEWNEIGH with invalid vlan id %d\n",
+				vid);
+			return -EINVAL;
+		}
+	}
+	*p_vid = vid;
+	return 0;
+}
+
 static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh)
 {
 	struct net *net = sock_net(skb->sk);
@@ -2345,6 +2368,7 @@  static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh)
 	struct nlattr *tb[NDA_MAX+1];
 	struct net_device *dev;
 	u8 *addr;
+	u16 vid;
 	int err;
 
 	err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL);
@@ -2370,6 +2394,10 @@  static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh)
 
 	addr = nla_data(tb[NDA_LLADDR]);
 
+	err = fbd_vid_parse(tb[NDA_VLAN], &vid);
+	if (err)
+		return err;
+
 	err = -EOPNOTSUPP;
 
 	/* Support fdb on master device the net/bridge default case */
@@ -2378,7 +2406,8 @@  static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh)
 		struct net_device *br_dev = netdev_master_upper_dev_get(dev);
 		const struct net_device_ops *ops = br_dev->netdev_ops;
 
-		err = ops->ndo_fdb_add(ndm, tb, dev, addr, nlh->nlmsg_flags);
+		err = ops->ndo_fdb_add(ndm, tb, dev, addr, vid,
+				       nlh->nlmsg_flags);
 		if (err)
 			goto out;
 		else
@@ -2389,9 +2418,10 @@  static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh)
 	if ((ndm->ndm_flags & NTF_SELF)) {
 		if (dev->netdev_ops->ndo_fdb_add)
 			err = dev->netdev_ops->ndo_fdb_add(ndm, tb, dev, addr,
+							   vid,
 							   nlh->nlmsg_flags);
 		else
-			err = ndo_dflt_fdb_add(ndm, tb, dev, addr,
+			err = ndo_dflt_fdb_add(ndm, tb, dev, addr, vid,
 					       nlh->nlmsg_flags);
 
 		if (!err) {
@@ -2409,7 +2439,7 @@  out:
 int ndo_dflt_fdb_del(struct ndmsg *ndm,
 		     struct nlattr *tb[],
 		     struct net_device *dev,
-		     const unsigned char *addr)
+		     const unsigned char *addr, u16 vid)
 {
 	int err = -EINVAL;
 
@@ -2438,6 +2468,7 @@  static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh)
 	struct net_device *dev;
 	int err = -EINVAL;
 	__u8 *addr;
+	u16 vid;
 
 	if (!netlink_capable(skb, CAP_NET_ADMIN))
 		return -EPERM;
@@ -2465,6 +2496,10 @@  static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh)
 
 	addr = nla_data(tb[NDA_LLADDR]);
 
+	err = fbd_vid_parse(tb[NDA_VLAN], &vid);
+	if (err)
+		return err;
+
 	err = -EOPNOTSUPP;
 
 	/* Support fdb on master device the net/bridge default case */
@@ -2474,7 +2509,7 @@  static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh)
 		const struct net_device_ops *ops = br_dev->netdev_ops;
 
 		if (ops->ndo_fdb_del)
-			err = ops->ndo_fdb_del(ndm, tb, dev, addr);
+			err = ops->ndo_fdb_del(ndm, tb, dev, addr, vid);
 
 		if (err)
 			goto out;
@@ -2485,9 +2520,10 @@  static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh)
 	/* Embedded bridge, macvlan, and any other device support */
 	if (ndm->ndm_flags & NTF_SELF) {
 		if (dev->netdev_ops->ndo_fdb_del)
-			err = dev->netdev_ops->ndo_fdb_del(ndm, tb, dev, addr);
+			err = dev->netdev_ops->ndo_fdb_del(ndm, tb, dev, addr,
+							   vid);
 		else
-			err = ndo_dflt_fdb_del(ndm, tb, dev, addr);
+			err = ndo_dflt_fdb_del(ndm, tb, dev, addr, vid);
 
 		if (!err) {
 			rtnl_fdb_notify(dev, addr, RTM_DELNEIGH);