diff mbox

[net-next,2/2] mv88e6131: bonding: implement single device trunking

Message ID 1424429473-4601-3-git-send-email-jonasj76@gmail.com
State Changes Requested, archived
Delegated to: David Miller
Headers show

Commit Message

Jonas Johansson Feb. 20, 2015, 10:51 a.m. UTC
From: Jonas Johansson <jonas.johansson@westermo.se>

This patch will use the DSA hardware bonding support hooks to setup trunking
for the Marvell 88E6095 device. The implementation only handles trunking in
a single device.

Hooks:
 .bond_add_group: Add port to a bond group
 .bond_del_group: Remove port from a bond group
 .bond_attach: Attach/activate port in bond group
 .bond_detach: Detach/inactivate port in bond group

Procedure to add/remome port from bond group:
 Setup trunk learning (Port Association Vector)
 Setup loop prevention (VLAN Table)
 Setup load balancing (Trunk Mask Load Balance Table)

Procedure to attach/detach port:
 Change load balancing (Trunk Mask Load Balance Table)

Signed-off-by: Jonas Johansson <jonas.johansson@westermo.se>
---
 drivers/net/dsa/mv88e6131.c | 254 ++++++++++++++++++++++++++++++++++++++++++++
 drivers/net/dsa/mv88e6xxx.h |  14 +++
 2 files changed, 268 insertions(+)

Comments

Andrew Lunn Feb. 20, 2015, 3:26 p.m. UTC | #1
On Fri, Feb 20, 2015 at 11:51:13AM +0100, Jonas Johansson wrote:
> From: Jonas Johansson <jonas.johansson@westermo.se>
> 
> This patch will use the DSA hardware bonding support hooks to setup trunking
> for the Marvell 88E6095 device. The implementation only handles trunking in
> a single device.
> 
> Hooks:
>  .bond_add_group: Add port to a bond group
>  .bond_del_group: Remove port from a bond group
>  .bond_attach: Attach/activate port in bond group
>  .bond_detach: Detach/inactivate port in bond group
> 
> Procedure to add/remome port from bond group:
>  Setup trunk learning (Port Association Vector)
>  Setup loop prevention (VLAN Table)
>  Setup load balancing (Trunk Mask Load Balance Table)
> 
> Procedure to attach/detach port:
>  Change load balancing (Trunk Mask Load Balance Table)
> 
> Signed-off-by: Jonas Johansson <jonas.johansson@westermo.se>
> ---
>  drivers/net/dsa/mv88e6131.c | 254 ++++++++++++++++++++++++++++++++++++++++++++
>  drivers/net/dsa/mv88e6xxx.h |  14 +++
>  2 files changed, 268 insertions(+)
> 
> diff --git a/drivers/net/dsa/mv88e6131.c b/drivers/net/dsa/mv88e6131.c
> index 2540ef0..3ba7a0c 100644
> --- a/drivers/net/dsa/mv88e6131.c
> +++ b/drivers/net/dsa/mv88e6131.c
> @@ -382,6 +382,256 @@ mv88e6131_get_ethtool_stats(struct dsa_switch *ds,
>  				    mv88e6131_hw_stats, port, data);
>  }
>  
> +/* Trunking */
> +static int mv88e6131_bond_set_trunk_learning(struct dsa_switch *ds,
> +					     int *ports, size_t num)
> +{
> +	u16 port_vec = 0;
> +	int ret;
> +	int i;
> +
> +	num = num < MAX_PORTS ? num : MAX_PORTS;
> +
> +	for (i = 0; i < num; i++)
> +		port_vec |= 1 << ports[i];
> +
> +	for (i = 0; i < num; i++) {
> +		ret = mv88e6xxx_reg_read(ds, REG_PORT(ports[i]), REG_PORT_PAV);
> +		if (ret < 0)
> +			continue;
> +		ret = (ret & 0xf800) | (port_vec & 0x7ff);
> +		mv88e6xxx_reg_write(ds, REG_PORT(ports[i]), REG_PORT_PAV, ret);
> +	}
> +
> +	return 0;
> +}

The mv886060 seems to have the PAV register. So i guess most of the
Marvell switches support this. Is there anything specific to the 6131
here? Could this be moved into mv88x6xxx so other switch drivers can
use it?

> +
> +static int mv88e6131_bond_set_loop_prevention(struct dsa_switch *ds,
> +					      int *ports, size_t num)
> +{
> +	u16 port_vec = 0;
> +	int ret;
> +	int i;
> +
> +	num = num < MAX_PORTS ? num : MAX_PORTS;
> +
> +	for (i = 0; i < num; i++)
> +		port_vec |= 1 << ports[i];
> +
> +	for (i = 0; i < num; i++) {
> +		ret = mv88e6xxx_reg_read(ds, REG_PORT(ports[i]), REG_PORT_VLAN_MAP);
> +		if (ret < 0)
> +			continue;
> +		ret &= ~port_vec & 0x7ff;
> +		mv88e6xxx_reg_write(ds, REG_PORT(ports[i]), REG_PORT_VLAN_MAP, ret);
> +	}
> +
> +	return 0;
> +}

Same question again, anything specific to the 6131 here?

> +static int mv88e6131_wait_trunk_mask(struct dsa_switch *ds)
> +{
> +	const int max_retries = 10;
> +	int retries = 0;
> +	int ret;
> +
> +	/* Wait for update Trunk Mask data */
> +	while (1) {
> +		ret = REG_READ(REG_GLOBAL2, REG_TRUNK_MASK);
> +		if (!(ret & 0x8000))
> +			return ret;
> +		if (retries > max_retries) {
> +			pr_warn("mv88e6131: Timeout waiting for "
> +				"Trunk Mask Table Register Update\n");
> +			return -EBUSY;
> +		}
> +		retries++;
> +		usleep_range(20, 50);
> +	};

This looks a lot like the wait functions what Guenter Roeck added to
6352 and i just moved to mv88e6xxx.c. Please use the generic
infrastructure in the shared code.

Please could you look at all your functions and see what is specific
to the 6131 and what is generic. Place the generic code into mv88e6xxx
please so we can all use it.

       Thanks
	Andrew
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jonas Johansson Feb. 20, 2015, 3:56 p.m. UTC | #2
On Fri, 20 Feb 2015, Andrew Lunn wrote:

> On Fri, Feb 20, 2015 at 11:51:13AM +0100, Jonas Johansson wrote:
>> From: Jonas Johansson <jonas.johansson@westermo.se>
>>
>> This patch will use the DSA hardware bonding support hooks to setup trunking
>> for the Marvell 88E6095 device. The implementation only handles trunking in
>> a single device.
>>
>> Hooks:
>>  .bond_add_group: Add port to a bond group
>>  .bond_del_group: Remove port from a bond group
>>  .bond_attach: Attach/activate port in bond group
>>  .bond_detach: Detach/inactivate port in bond group
>>
>> Procedure to add/remome port from bond group:
>>  Setup trunk learning (Port Association Vector)
>>  Setup loop prevention (VLAN Table)
>>  Setup load balancing (Trunk Mask Load Balance Table)
>>
>> Procedure to attach/detach port:
>>  Change load balancing (Trunk Mask Load Balance Table)
>>
>> Signed-off-by: Jonas Johansson <jonas.johansson@westermo.se>
>> ---
>>  drivers/net/dsa/mv88e6131.c | 254 ++++++++++++++++++++++++++++++++++++++++++++
>>  drivers/net/dsa/mv88e6xxx.h |  14 +++
>>  2 files changed, 268 insertions(+)
>>
>> diff --git a/drivers/net/dsa/mv88e6131.c b/drivers/net/dsa/mv88e6131.c
>> index 2540ef0..3ba7a0c 100644
>> --- a/drivers/net/dsa/mv88e6131.c
>> +++ b/drivers/net/dsa/mv88e6131.c
>> @@ -382,6 +382,256 @@ mv88e6131_get_ethtool_stats(struct dsa_switch *ds,
>>  				    mv88e6131_hw_stats, port, data);
>>  }
>>
>> +/* Trunking */
>> +static int mv88e6131_bond_set_trunk_learning(struct dsa_switch *ds,
>> +					     int *ports, size_t num)
>> +{
>> +	u16 port_vec = 0;
>> +	int ret;
>> +	int i;
>> +
>> +	num = num < MAX_PORTS ? num : MAX_PORTS;
>> +
>> +	for (i = 0; i < num; i++)
>> +		port_vec |= 1 << ports[i];
>> +
>> +	for (i = 0; i < num; i++) {
>> +		ret = mv88e6xxx_reg_read(ds, REG_PORT(ports[i]), REG_PORT_PAV);
>> +		if (ret < 0)
>> +			continue;
>> +		ret = (ret & 0xf800) | (port_vec & 0x7ff);
>> +		mv88e6xxx_reg_write(ds, REG_PORT(ports[i]), REG_PORT_PAV, ret);
>> +	}
>> +
>> +	return 0;
>> +}
>
> The mv886060 seems to have the PAV register. So i guess most of the
> Marvell switches support this. Is there anything specific to the 6131
> here? Could this be moved into mv88x6xxx so other switch drivers can
> use it?
>
I guess you are correct, i'll look into it.
>> +
>> +static int mv88e6131_bond_set_loop_prevention(struct dsa_switch *ds,
>> +					      int *ports, size_t num)
>> +{
>> +	u16 port_vec = 0;
>> +	int ret;
>> +	int i;
>> +
>> +	num = num < MAX_PORTS ? num : MAX_PORTS;
>> +
>> +	for (i = 0; i < num; i++)
>> +		port_vec |= 1 << ports[i];
>> +
>> +	for (i = 0; i < num; i++) {
>> +		ret = mv88e6xxx_reg_read(ds, REG_PORT(ports[i]), REG_PORT_VLAN_MAP);
>> +		if (ret < 0)
>> +			continue;
>> +		ret &= ~port_vec & 0x7ff;
>> +		mv88e6xxx_reg_write(ds, REG_PORT(ports[i]), REG_PORT_VLAN_MAP, ret);
>> +	}
>> +
>> +	return 0;
>> +}
>
> Same question again, anything specific to the 6131 here?
>
I'll check.
>> +static int mv88e6131_wait_trunk_mask(struct dsa_switch *ds)
>> +{
>> +	const int max_retries = 10;
>> +	int retries = 0;
>> +	int ret;
>> +
>> +	/* Wait for update Trunk Mask data */
>> +	while (1) {
>> +		ret = REG_READ(REG_GLOBAL2, REG_TRUNK_MASK);
>> +		if (!(ret & 0x8000))
>> +			return ret;
>> +		if (retries > max_retries) {
>> +			pr_warn("mv88e6131: Timeout waiting for "
>> +				"Trunk Mask Table Register Update\n");
>> +			return -EBUSY;
>> +		}
>> +		retries++;
>> +		usleep_range(20, 50);
>> +	};
>
> This looks a lot like the wait functions what Guenter Roeck added to
> 6352 and i just moved to mv88e6xxx.c. Please use the generic
> infrastructure in the shared code.
>
Seems like the function is doing exactly what I'm looking for.
> Please could you look at all your functions and see what is specific
> to the 6131 and what is generic. Place the generic code into mv88e6xxx
> please so we can all use it.
>
>       Thanks
> 	Andrew
>
Thanks for your review, I'll check if the functions is specific to the 
6131 or if it the code could be moved to mv88e6xxx.c.
On vacation a.t.m, back in a week.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Florian Fainelli March 6, 2015, 5:06 p.m. UTC | #3
On 20/02/15 07:26, Andrew Lunn wrote:
> On Fri, Feb 20, 2015 at 11:51:13AM +0100, Jonas Johansson wrote:
> 
> Please could you look at all your functions and see what is specific
> to the 6131 and what is generic. Place the generic code into mv88e6xxx
> please so we can all use it.

Out of curiosity, how many bonding/trunking groups are supported on the
switches models currently in tree?

Let's say there is a limitation: like no more than 2 different bonding
groups on a given physical switch, where would we put this limitation,
down to the switch driver?
Andrew Lunn March 6, 2015, 7:23 p.m. UTC | #4
On Fri, Mar 06, 2015 at 09:06:50AM -0800, Florian Fainelli wrote:
> On 20/02/15 07:26, Andrew Lunn wrote:
> > On Fri, Feb 20, 2015 at 11:51:13AM +0100, Jonas Johansson wrote:
> > 
> > Please could you look at all your functions and see what is specific
> > to the 6131 and what is generic. Place the generic code into mv88e6xxx
> > please so we can all use it.
> 
> Out of curiosity, how many bonding/trunking groups are supported on the
> switches models currently in tree?
> 
> Let's say there is a limitation: like no more than 2 different bonding
> groups on a given physical switch, where would we put this limitation,
> down to the switch driver?

Hi Florian

It is limited, but it seems to be quite a high limit. I don't have
exact numbers for the devices currently in tree, but i've used an 10
port switch which had a limit something like 8 trunk groups, and a
maximum of 8 ports per trunk.

I suspect we would put the resource tracking in the shared mv88e6xxx
code, and configure it with the limitations from the specific chip
driver.

However, does SF2 have similar trunking capabilities, and limits? Does
it make sense to have this at a higher level so it can be used by all
drivers?

	Andrew
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Florian Fainelli March 6, 2015, 8:47 p.m. UTC | #5
On 06/03/15 11:23, Andrew Lunn wrote:
> On Fri, Mar 06, 2015 at 09:06:50AM -0800, Florian Fainelli wrote:
>> On 20/02/15 07:26, Andrew Lunn wrote:
>>> On Fri, Feb 20, 2015 at 11:51:13AM +0100, Jonas Johansson wrote:
>>>
>>> Please could you look at all your functions and see what is specific
>>> to the 6131 and what is generic. Place the generic code into mv88e6xxx
>>> please so we can all use it.
>>
>> Out of curiosity, how many bonding/trunking groups are supported on the
>> switches models currently in tree?
>>
>> Let's say there is a limitation: like no more than 2 different bonding
>> groups on a given physical switch, where would we put this limitation,
>> down to the switch driver?
> 
> Hi Florian
> 
> It is limited, but it seems to be quite a high limit. I don't have
> exact numbers for the devices currently in tree, but i've used an 10
> port switch which had a limit something like 8 trunk groups, and a
> maximum of 8 ports per trunk.
> 
> I suspect we would put the resource tracking in the shared mv88e6xxx
> code, and configure it with the limitations from the specific chip
> driver.
> 
> However, does SF2 have similar trunking capabilities, and limits? Does
> it make sense to have this at a higher level so it can be used by all
> drivers?

Most Broadcom switches, either SF2 or roboswitch (b53) have a limit of 2
trunking groups, without limitations on the number of ports included in
any of these two groups.

The larger question is once we start advertising capabilities, where
does that stop, right? It would probably be simpler for now to e.g:
allow 2 trunking groups to be configured, and when trying to configure a
3rd one, return -ENOSPC and act upon that to either take the software
slow path (which is probably not possible) or just return a hard error
condition.
Andrew Lunn March 6, 2015, 9:47 p.m. UTC | #6
Hi Florian

> Most Broadcom switches, either SF2 or roboswitch (b53) have a limit of 2
> trunking groups, without limitations on the number of ports included in
> any of these two groups.

O.K, so maybe we want the basic resource management in the DSA layer,
not the switch drivers.
 
> The larger question is once we start advertising capabilities, where
> does that stop, right? It would probably be simpler for now to e.g:
> allow 2 trunking groups to be configured, and when trying to configure a
> 3rd one, return -ENOSPC and act upon that to either take the software
> slow path (which is probably not possible) or just return a hard error
> condition.

This is more than a DSA question. It applies to all the hardware
acceleration being discussed at the moment. As you hinted to above, i
suppose we have two different situations:

1) We can fall back to a software slow path.

2) There is no software fallback, we have to error out, and it would
   be nice to have a well defined error code for out of hardware
   resources.

We also should think about how we tell user space we have fallen back
to a slow path. I'm sure users want to know why it works, but works
much slower.

     Andrew
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Scott Feldman March 6, 2015, 10:43 p.m. UTC | #7
On Fri, Mar 6, 2015 at 1:47 PM, Andrew Lunn <andrew@lunn.ch> wrote:
> Hi Florian
>
>> Most Broadcom switches, either SF2 or roboswitch (b53) have a limit of 2
>> trunking groups, without limitations on the number of ports included in
>> any of these two groups.
>
> O.K, so maybe we want the basic resource management in the DSA layer,
> not the switch drivers.
>
>> The larger question is once we start advertising capabilities, where
>> does that stop, right? It would probably be simpler for now to e.g:
>> allow 2 trunking groups to be configured, and when trying to configure a
>> 3rd one, return -ENOSPC and act upon that to either take the software
>> slow path (which is probably not possible) or just return a hard error
>> condition.
>
> This is more than a DSA question. It applies to all the hardware
> acceleration being discussed at the moment. As you hinted to above, i
> suppose we have two different situations:
>
> 1) We can fall back to a software slow path.
>
> 2) There is no software fallback, we have to error out, and it would
>    be nice to have a well defined error code for out of hardware
>    resources.
>
> We also should think about how we tell user space we have fallen back
> to a slow path. I'm sure users want to know why it works, but works
> much slower.

For the general hardware acceleration of bonds, my thoughts for switchdev are:

(Assume 802.3ad LAG setup for discussion sake, other bonding modes are similar).

1. The driver has access to port membership notification via netevent,
so driver knows which ports are in which bonds.  This notification is
passive; there is no way to signal to user that when port was put in a
bond if it was programmed into a device LAG group or not.  It's
totally up to the driver and device resources to make that decision.

2. The driver can know port active status via netevent as well.  If
device has the port in a LAG group, then reflect the active status
down to device port.  Again, a passive notification.

3. CPU-originating egress traffic would be LAGed by bonding (or team)
driver.  (This is true regardless if the device LAG group was setup or
not).

4a. If device LAG group setup, forwarded traffic would be LAG egressed
by device (fast path).

4b. If no device LAG group, ingress traffic is sent to CPU (slow path)
for bonding (or team) to LAG egress.

So software fall-back (4b) is the default case if driver/device can't
setup LAG group for forwarding.

So the question is: how does user know which bonds are accelerated?
So far, we've used the label "external" to mark L2 bridge FDB which
are offloaded to the device and "external" to mark L3 routes offloaded
to the device.  Do we mark bonds as "external" if the LAG path is
offloaded to the device?

-scott
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jiri Pirko March 7, 2015, 2:38 p.m. UTC | #8
Fri, Mar 06, 2015 at 11:43:55PM CET, sfeldma@gmail.com wrote:
>On Fri, Mar 6, 2015 at 1:47 PM, Andrew Lunn <andrew@lunn.ch> wrote:
>> Hi Florian
>>
>>> Most Broadcom switches, either SF2 or roboswitch (b53) have a limit of 2
>>> trunking groups, without limitations on the number of ports included in
>>> any of these two groups.
>>
>> O.K, so maybe we want the basic resource management in the DSA layer,
>> not the switch drivers.
>>
>>> The larger question is once we start advertising capabilities, where
>>> does that stop, right? It would probably be simpler for now to e.g:
>>> allow 2 trunking groups to be configured, and when trying to configure a
>>> 3rd one, return -ENOSPC and act upon that to either take the software
>>> slow path (which is probably not possible) or just return a hard error
>>> condition.
>>
>> This is more than a DSA question. It applies to all the hardware
>> acceleration being discussed at the moment. As you hinted to above, i
>> suppose we have two different situations:
>>
>> 1) We can fall back to a software slow path.
>>
>> 2) There is no software fallback, we have to error out, and it would
>>    be nice to have a well defined error code for out of hardware
>>    resources.
>>
>> We also should think about how we tell user space we have fallen back
>> to a slow path. I'm sure users want to know why it works, but works
>> much slower.
>
>For the general hardware acceleration of bonds, my thoughts for switchdev are:
>
>(Assume 802.3ad LAG setup for discussion sake, other bonding modes are similar).
>
>1. The driver has access to port membership notification via netevent,
>so driver knows which ports are in which bonds.  This notification is
>passive; there is no way to signal to user that when port was put in a
>bond if it was programmed into a device LAG group or not.  It's
>totally up to the driver and device resources to make that decision.

Exactly. The fact if another group can be added or not should be decided
and handled by driver. The driver then notifies higher layers using
switchdev notifier event.
 
>
>2. The driver can know port active status via netevent as well.  If
>device has the port in a LAG group, then reflect the active status
>down to device port.  Again, a passive notification.
>
>3. CPU-originating egress traffic would be LAGed by bonding (or team)
>driver.  (This is true regardless if the device LAG group was setup or
>not).
>
>4a. If device LAG group setup, forwarded traffic would be LAG egressed
>by device (fast path).
>
>4b. If no device LAG group, ingress traffic is sent to CPU (slow path)
>for bonding (or team) to LAG egress.
>
>So software fall-back (4b) is the default case if driver/device can't
>setup LAG group for forwarding.
>
>So the question is: how does user know which bonds are accelerated?
>So far, we've used the label "external" to mark L2 bridge FDB which
>are offloaded to the device and "external" to mark L3 routes offloaded
>to the device.  Do we mark bonds as "external" if the LAG path is
>offloaded to the device?

I believe that this is the way to go. Introduce this flag for bond and
team to signal user if LAG is offloaded or not.


>
>-scott
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
John Fastabend March 7, 2015, 5:31 p.m. UTC | #9
On 03/07/2015 06:38 AM, Jiri Pirko wrote:
> Fri, Mar 06, 2015 at 11:43:55PM CET, sfeldma@gmail.com wrote:
>> On Fri, Mar 6, 2015 at 1:47 PM, Andrew Lunn <andrew@lunn.ch> wrote:
>>> Hi Florian
>>>
>>>> Most Broadcom switches, either SF2 or roboswitch (b53) have a limit of 2
>>>> trunking groups, without limitations on the number of ports included in
>>>> any of these two groups.
>>>
>>> O.K, so maybe we want the basic resource management in the DSA layer,
>>> not the switch drivers.
>>>
>>>> The larger question is once we start advertising capabilities, where
>>>> does that stop, right? It would probably be simpler for now to e.g:
>>>> allow 2 trunking groups to be configured, and when trying to configure a
>>>> 3rd one, return -ENOSPC and act upon that to either take the software
>>>> slow path (which is probably not possible) or just return a hard error
>>>> condition.
>>>
>>> This is more than a DSA question. It applies to all the hardware
>>> acceleration being discussed at the moment. As you hinted to above, i
>>> suppose we have two different situations:
>>>
>>> 1) We can fall back to a software slow path.
>>>
>>> 2) There is no software fallback, we have to error out, and it would
>>>     be nice to have a well defined error code for out of hardware
>>>     resources.
>>>

I have a (3) case where we don't ever want to fall back to slow path
even if it is possible and (4) we don't ever want to offload the
operation even when it is possible. Having to program the device then
unwind it from user space seems a bit error prone and ugly. At some
point I think we will have to handle these cases its probably fine to
get the transparent offload working as a start though. Specifically,
talking about LAG I'm not sure of the use case for (4) but in
general it is useful.

>>> We also should think about how we tell user space we have fallen back
>>> to a slow path. I'm sure users want to know why it works, but works
>>> much slower.
>>
>> For the general hardware acceleration of bonds, my thoughts for switchdev are:
>>
>> (Assume 802.3ad LAG setup for discussion sake, other bonding modes are similar).
>>
>> 1. The driver has access to port membership notification via netevent,
>> so driver knows which ports are in which bonds.  This notification is
>> passive; there is no way to signal to user that when port was put in a
>> bond if it was programmed into a device LAG group or not.  It's
>> totally up to the driver and device resources to make that decision.
>
> Exactly. The fact if another group can be added or not should be decided
> and handled by driver. The driver then notifies higher layers using
> switchdev notifier event.

hmm same point I need to be able to know ahead of time if it is going to
succeed or fail. I think we can extend this in two ways one add an
explicit flag to say 'add this to hardware or fail' or 'never offload
this' although for LAG setup this might be challenging because the
notification is passive as noted. The other way to do this is to
sufficient exposure of the hardware model and resources so software
can predict a priori that the LAG setup will be accelerated. I think we
need both...

My only point here is eventually management of the system is challenging
if its entirely a driver decision at least in many cases where we have
intelligent agents managing the network. But sure get the simple case
working as a start.

>
>>
>> 2. The driver can know port active status via netevent as well.  If
>> device has the port in a LAG group, then reflect the active status
>> down to device port.  Again, a passive notification.
>>
>> 3. CPU-originating egress traffic would be LAGed by bonding (or team)
>> driver.  (This is true regardless if the device LAG group was setup or
>> not).
>>
>> 4a. If device LAG group setup, forwarded traffic would be LAG egressed
>> by device (fast path).
>>
>> 4b. If no device LAG group, ingress traffic is sent to CPU (slow path)
>> for bonding (or team) to LAG egress.
>>
>> So software fall-back (4b) is the default case if driver/device can't
>> setup LAG group for forwarding.
>>
>> So the question is: how does user know which bonds are accelerated?
>> So far, we've used the label "external" to mark L2 bridge FDB which
>> are offloaded to the device and "external" to mark L3 routes offloaded
>> to the device.  Do we mark bonds as "external" if the LAG path is
>> offloaded to the device?
>
> I believe that this is the way to go. Introduce this flag for bond and
> team to signal user if LAG is offloaded or not.
>
>
>>
>> -scott
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>
diff mbox

Patch

diff --git a/drivers/net/dsa/mv88e6131.c b/drivers/net/dsa/mv88e6131.c
index 2540ef0..3ba7a0c 100644
--- a/drivers/net/dsa/mv88e6131.c
+++ b/drivers/net/dsa/mv88e6131.c
@@ -382,6 +382,256 @@  mv88e6131_get_ethtool_stats(struct dsa_switch *ds,
 				    mv88e6131_hw_stats, port, data);
 }
 
+/* Trunking */
+static int mv88e6131_bond_set_trunk_learning(struct dsa_switch *ds,
+					     int *ports, size_t num)
+{
+	u16 port_vec = 0;
+	int ret;
+	int i;
+
+	num = num < MAX_PORTS ? num : MAX_PORTS;
+
+	for (i = 0; i < num; i++)
+		port_vec |= 1 << ports[i];
+
+	for (i = 0; i < num; i++) {
+		ret = mv88e6xxx_reg_read(ds, REG_PORT(ports[i]), REG_PORT_PAV);
+		if (ret < 0)
+			continue;
+		ret = (ret & 0xf800) | (port_vec & 0x7ff);
+		mv88e6xxx_reg_write(ds, REG_PORT(ports[i]), REG_PORT_PAV, ret);
+	}
+
+	return 0;
+}
+
+static int mv88e6131_bond_set_loop_prevention(struct dsa_switch *ds,
+					      int *ports, size_t num)
+{
+	u16 port_vec = 0;
+	int ret;
+	int i;
+
+	num = num < MAX_PORTS ? num : MAX_PORTS;
+
+	for (i = 0; i < num; i++)
+		port_vec |= 1 << ports[i];
+
+	for (i = 0; i < num; i++) {
+		ret = mv88e6xxx_reg_read(ds, REG_PORT(ports[i]), REG_PORT_VLAN_MAP);
+		if (ret < 0)
+			continue;
+		ret &= ~port_vec & 0x7ff;
+		mv88e6xxx_reg_write(ds, REG_PORT(ports[i]), REG_PORT_VLAN_MAP, ret);
+	}
+
+	return 0;
+}
+
+static int mv88e6131_wait_trunk_mask(struct dsa_switch *ds)
+{
+	const int max_retries = 10;
+	int retries = 0;
+	int ret;
+
+	/* Wait for update Trunk Mask data */
+	while (1) {
+		ret = REG_READ(REG_GLOBAL2, REG_TRUNK_MASK);
+		if (!(ret & 0x8000))
+			return ret;
+		if (retries > max_retries) {
+			pr_warn("mv88e6131: Timeout waiting for "
+				"Trunk Mask Table Register Update\n");
+			return -EBUSY;
+		}
+		retries++;
+		usleep_range(20, 50);
+	};
+
+	return -EPERM;
+}
+
+static int mv88e6131_get_trunk_mask(struct dsa_switch *ds, int trunk_nr, u16 *mask)
+{
+	int ret;
+
+	if (trunk_nr > 0x7)
+		return -EINVAL;
+
+	ret = mv88e6131_wait_trunk_mask(ds);
+	if (ret < 0)
+		return ret;
+
+	/* Set MaskNum */
+	ret = (ret & 0x0fff) | (trunk_nr << 12);
+	REG_WRITE(REG_GLOBAL2, REG_TRUNK_MASK, ret);
+
+	/* Get TrunkMask */
+	ret = REG_READ(REG_GLOBAL2, REG_TRUNK_MASK);
+	*mask = ret & 0x7FF;
+
+	return 0;
+}
+
+static int mv88e6131_set_trunk_mask(struct dsa_switch *ds, int trunk_nr, u16 mask)
+{
+	int ret;
+
+	if (trunk_nr > 0x7)
+		return -EINVAL;
+
+	ret = mv88e6131_wait_trunk_mask(ds);
+	if (ret < 0)
+		return ret;
+
+	/* Write TrunkMask */
+	ret = 0x8000 | (ret & 0x7800) | (trunk_nr << 12) | (mask & 0x7ff);
+	REG_WRITE(REG_GLOBAL2, REG_TRUNK_MASK, ret);
+
+	return 0;
+}
+
+static int mv88e6131_bond_set_load_balancing(struct dsa_switch *ds,
+					     int *ports, bool *attached, size_t num)
+{
+	u16 mask;
+	u16 member_mask = 0;
+	int att_ports[MAX_PORTS];
+	int att_num = 0;
+	int ret;
+	int i;
+
+	num = num < MAX_PORTS ? num : MAX_PORTS;
+
+	for (i = 0; i < num; i++) {
+		member_mask |= 1 << ports[i];
+		if (attached[i])
+			att_ports[att_num++] = ports[i];
+	}
+
+	for (i = 0; i < 8; i++) {
+		ret = mv88e6131_get_trunk_mask(ds, i, &mask);
+		if (ret < 0)
+			continue;
+		mask &= ~member_mask;
+		if (att_num)
+			mask |= 1 << att_ports[i % att_num];
+		mv88e6131_set_trunk_mask(ds, i, mask);
+	}
+
+	return 0;
+}
+
+static int mv88e6131_bond_get_ports(struct dsa_switch *ds, int gid, int *ports, bool *attached, size_t sz)
+{
+	struct mv88e6xxx_priv_state *ps = ds_to_priv(ds);
+	int num = 0;
+	int p;
+
+	for (p = 0; (p < MAX_PORTS) && (num < sz) ; p++) {
+		if (ps->bond_port[p].gid == gid) {
+			ports[num] = p;
+			attached[num] = ps->bond_port[p].attached;
+			num++;
+		}
+	}
+
+	return num;
+}
+
+static int mv88e6131_bond_setup(struct dsa_switch *ds, int gid)
+{
+	int ports[MAX_PORTS];
+	bool attached[MAX_PORTS];
+	int num;
+	int err = 0;
+
+	num = mv88e6131_bond_get_ports(ds, gid, ports, attached, MAX_PORTS);
+
+	err = mv88e6131_bond_set_trunk_learning(ds, ports, num);
+	if (err)
+		return err;
+	err = mv88e6131_bond_set_loop_prevention(ds, ports, num);
+	if (err)
+		return err;
+	err = mv88e6131_bond_set_load_balancing(ds, ports, attached, num);
+	if (err)
+		return err;
+
+	return 0;
+
+}
+
+static int mv88e6131_bond_add_group(struct dsa_switch *ds, int port, int gid)
+{
+	struct mv88e6xxx_priv_state *ps = ds_to_priv(ds);
+
+	ps->bond_port[port].gid = gid;
+	ps->bond_port[port].attached = false;
+
+	return mv88e6131_bond_setup(ds, gid);
+}
+
+static int mv88e6131_bond_del_group(struct dsa_switch *ds, int port, int gid)
+{
+	struct mv88e6xxx_priv_state *ps = ds_to_priv(ds);
+	bool btmp;
+	int ret;
+
+	ps->bond_port[port].gid = 0;
+	ps->bond_port[port].attached = false;
+	mv88e6131_bond_setup(ds, gid);
+
+	/* Reset trunk learning */
+	ret = mv88e6xxx_reg_read(ds, REG_PORT(port), REG_PORT_PAV);
+	if (ret >= 0) {
+		ret = (ret & 0xf800) | ((1 << port) & 0x7ff);
+		mv88e6xxx_reg_write(ds, REG_PORT(port), REG_PORT_PAV, ret);
+	}
+	/* Reset loop prevention  */
+	ret = mv88e6xxx_reg_read(ds, REG_PORT(port), REG_PORT_VLAN_MAP);
+	if (ret >= 0) {
+		ret = (ret & 0xf800) | ((1 << dsa_upstream_port(ds)) & 0x7ff);
+		mv88e6xxx_reg_write(ds, REG_PORT(port), REG_PORT_VLAN_MAP, ret);
+	}
+	/* Reset load balancing */
+	btmp = true;
+	mv88e6131_bond_set_load_balancing(ds, &port, &btmp, 1);
+
+	return 0;
+}
+
+static int mv88e6131_bond_attach(struct dsa_switch *ds, int port)
+{
+	struct mv88e6xxx_priv_state *ps = ds_to_priv(ds);
+	int ports[MAX_PORTS];
+	bool attached[MAX_PORTS];
+	int gid = ps->bond_port[port].gid;
+	int num;
+
+	ps->bond_port[port].attached = true;
+	num = mv88e6131_bond_get_ports(ds, gid, ports, attached, MAX_PORTS);
+
+	return mv88e6131_bond_set_load_balancing(ds, ports, attached, num);
+}
+
+static int mv88e6131_bond_detach(struct dsa_switch *ds, int port)
+{
+	struct mv88e6xxx_priv_state *ps = ds_to_priv(ds);
+	int ports[MAX_PORTS];
+	bool attached[MAX_PORTS];
+	int gid = ps->bond_port[port].gid;
+	int num;
+
+	ps->bond_port[port].attached = false;
+	num = mv88e6131_bond_get_ports(ds, gid, ports, attached, MAX_PORTS);
+
+	return mv88e6131_bond_set_load_balancing(ds, ports, attached, num);
+}
+
+
+
 static int mv88e6131_get_sset_count(struct dsa_switch *ds)
 {
 	return ARRAY_SIZE(mv88e6131_hw_stats);
@@ -399,6 +649,10 @@  struct dsa_switch_driver mv88e6131_switch_driver = {
 	.get_strings		= mv88e6131_get_strings,
 	.get_ethtool_stats	= mv88e6131_get_ethtool_stats,
 	.get_sset_count		= mv88e6131_get_sset_count,
+	.bond_add_group		= mv88e6131_bond_add_group,
+	.bond_del_group		= mv88e6131_bond_del_group,
+	.bond_attach		= mv88e6131_bond_attach,
+	.bond_detach		= mv88e6131_bond_detach,
 };
 
 MODULE_ALIAS("platform:mv88e6085");
diff --git a/drivers/net/dsa/mv88e6xxx.h b/drivers/net/dsa/mv88e6xxx.h
index 7294227..383b224 100644
--- a/drivers/net/dsa/mv88e6xxx.h
+++ b/drivers/net/dsa/mv88e6xxx.h
@@ -11,9 +11,19 @@ 
 #ifndef __MV88E6XXX_H
 #define __MV88E6XXX_H
 
+#define MAX_PORTS		11
+
 #define REG_PORT(p)		(0x10 + (p))
+#define REG_PORT_VLAN_MAP	0x6
+#define REG_PORT_PAV		0xb
 #define REG_GLOBAL		0x1b
 #define REG_GLOBAL2		0x1c
+#define REG_TRUNK_MASK		0x7
+
+struct mv88e6xxx_bond_port {
+	int gid;
+	bool attached;
+};
 
 struct mv88e6xxx_priv_state {
 	/* When using multi-chip addressing, this mutex protects
@@ -49,6 +59,10 @@  struct mv88e6xxx_priv_state {
 	struct mutex eeprom_mutex;
 
 	int		id; /* switch product id */
+
+	/* Contains bonding info for each port
+	 */
+	struct mv88e6xxx_bond_port	bond_port[MAX_PORTS];
 };
 
 struct mv88e6xxx_hw_stat {