diff mbox

[1/4,net-next] net: phy: add Generic Netlink Ethernet switch configuration API

Message ID 1382466229-15123-2-git-send-email-f.fainelli@gmail.com
State Changes Requested, archived
Delegated to: David Miller
Headers show

Commit Message

Florian Fainelli Oct. 22, 2013, 6:23 p.m. UTC
This patch adds an Ethernet Switch generic netlink configuration API
which allows for doing the required configuration of managed Ethernet
switches commonly found in Wireless/Cable/DSL routers in the market.

Since this API is based on the Generic Netlink infrastructure it is very
easy to extend a particular switch driver to support additional features
and to adapt it to specific switches.

So far the API includes support for:

- getting/setting a port VLAN id
- getting/setting VLAN port membership
- getting a port link status
- getting a port statistics counters
- resetting a switch device
- applying a configuration to a switch device

Unlike the Distributed Switch Architecture code, this API is much
smaller and does not interfere with the networking stack packet flow, but
rather focuses on the control path of managed switches.

A concrete example of a switch driver is included in subsequent patches
to illustrate how it can be used as well as the required user-space
controlling tools.

Signed-off-by: Felix Fietkau <nbd@openwrt.org>
Signed-off-by: John Crispin <blogic@openwrt.org>
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
---
 Documentation/networking/swconfig.txt |  162 +++++
 MAINTAINERS                           |   10 +
 drivers/net/phy/Kconfig               |    6 +
 drivers/net/phy/Makefile              |    1 +
 drivers/net/phy/swconfig.c            | 1078 +++++++++++++++++++++++++++++++++
 include/linux/swconfig.h              |  180 ++++++
 include/uapi/linux/Kbuild             |    1 +
 include/uapi/linux/swconfig.h         |  103 ++++
 8 files changed, 1541 insertions(+)
 create mode 100644 Documentation/networking/swconfig.txt
 create mode 100644 drivers/net/phy/swconfig.c
 create mode 100644 include/linux/swconfig.h
 create mode 100644 include/uapi/linux/swconfig.h

Comments

Dan Williams Oct. 22, 2013, 7:22 p.m. UTC | #1
On Tue, 2013-10-22 at 11:23 -0700, Florian Fainelli wrote:
> This patch adds an Ethernet Switch generic netlink configuration API
> which allows for doing the required configuration of managed Ethernet
> switches commonly found in Wireless/Cable/DSL routers in the market.

"swconfig" probably means "switch config", but is there any way to
rename this away from the "sw" prefix, since "sw" typically means
"software" and not "switch"?

Dan

> Since this API is based on the Generic Netlink infrastructure it is very
> easy to extend a particular switch driver to support additional features
> and to adapt it to specific switches.
> 
> So far the API includes support for:
> 
> - getting/setting a port VLAN id
> - getting/setting VLAN port membership
> - getting a port link status
> - getting a port statistics counters
> - resetting a switch device
> - applying a configuration to a switch device
> 
> Unlike the Distributed Switch Architecture code, this API is much
> smaller and does not interfere with the networking stack packet flow, but
> rather focuses on the control path of managed switches.
> 
> A concrete example of a switch driver is included in subsequent patches
> to illustrate how it can be used as well as the required user-space
> controlling tools.
> 
> Signed-off-by: Felix Fietkau <nbd@openwrt.org>
> Signed-off-by: John Crispin <blogic@openwrt.org>
> Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
> ---
>  Documentation/networking/swconfig.txt |  162 +++++
>  MAINTAINERS                           |   10 +
>  drivers/net/phy/Kconfig               |    6 +
>  drivers/net/phy/Makefile              |    1 +
>  drivers/net/phy/swconfig.c            | 1078 +++++++++++++++++++++++++++++++++
>  include/linux/swconfig.h              |  180 ++++++
>  include/uapi/linux/Kbuild             |    1 +
>  include/uapi/linux/swconfig.h         |  103 ++++
>  8 files changed, 1541 insertions(+)
>  create mode 100644 Documentation/networking/swconfig.txt
>  create mode 100644 drivers/net/phy/swconfig.c
>  create mode 100644 include/linux/swconfig.h
>  create mode 100644 include/uapi/linux/swconfig.h
> 
> diff --git a/Documentation/networking/swconfig.txt b/Documentation/networking/swconfig.txt
> new file mode 100644
> index 0000000..f560066
> --- /dev/null
> +++ b/Documentation/networking/swconfig.txt
> @@ -0,0 +1,162 @@
> +Generic Netlink Switch configuration API
> +
> +Introduction
> +============
> +
> +The following documentation covers the Linux Ethernet switch configuration API
> +which is based on the Generic Netlink infrastructure.
> +
> +Scope and rationale
> +===================
> +
> +Most Ethernet switches found in small routers are managed switches which allow
> +the following operations:
> +
> +- configure a port to belong to a particular set of VLANs either as tagged or
> +  untagged
> +- configure a particular port to advertise specific link/speed/duplex settings
> +- collect statistics about the number of packets/bytes transferred/received
> +- any other vendor specific feature: rate limiting, single/double tagging...
> +
> +Such switches can be connected to the controlling CPU using different hardware
> +busses, but most commonly:
> +
> +- SPI/I2C/GPIO bitbanging
> +- MDIO
> +- Memory mapped into the CPU register address space
> +
> +As of today the usual way to configure such a switch was either to write a
> +specific driver or to write an user-space application which would have to know
> +about the hardware differences and figure out a way to access the switch
> +registers (spidev, SIOCIGGMIIREG, mmap...) from user-space.
> +
> +This has multiple issues:
> +
> +- proliferation of ad-hoc solutions to configure a switch both open source and
> +  proprietary
> +
> +- absence of common software reference for switches commonly found on the market
> +  (Broadcom, Lantiq/Infineon/ADMTek, Marvell, Qualcomm/Atheros...) which implies
> +  a duplication effort for each implementer
> +
> +- inability to leverage existing hardware representation mechanisms such as
> +  Device Tree (spidev, i2c-dev.. do not belong in Device Tree and rely on
> +  Linux-specific "forwarder" drivers) to describe a switch device
> +
> +The goal of the switch configuration API is to provide a common basis to build
> +re-usable and extensible switch drivers with the following ideas in mind:
> +
> +- having a central point of configuration on top of which a reference user-space
> +  implementation can be provided but also allow for other user-space
> +  implementations to exist
> +
> +- ensure the Linux kernel is in control of the actual hardware access
> +
> +- be extensible enough to support per-switch features without making the generic
> +  implementation too heavy weighted and without making user-space changes each
> +  and every time a new feature is added
> +
> +Based on these design goals the Generic Netlink kernel/user-space communication
> +mechanism was chosen because it allows for all design goals to be met.
> +
> +Distributed Switch Architecture vs. swconfig
> +============================================
> +
> +The Marvell Distributed Switch Architecture drivers is an existing solution
> +which is a heavy switch driver infrastructure, is Marvell centric, only
> +supports MDIO connected switches, mangles an Ethernet driver transmit/receive
> +paths and does not offer a central control path for the user.
> +
> +swconfig is vendor agnostic, does not mangle the transmit/receive path
> +of an Ethernet driver and is focused on the control path of the switch rather
> +that the data path. It is based on Generic Netlink to allow for each switch
> +driver to easily extend the swconfig API without causing major core parts rework
> +each and every time someone has a specific feature to implement and offers a
> +central configuration point with a well-defined API.
> +
> +Switch configuration API
> +========================
> +
> +The main data structure of the switch configuration API is a "struct switch_dev"
> +which contains the following members:
> +
> +- a set of common operations to all switches (struct switch_dev_ops)
> +- a network device pointer it is physically attached to
> +- a number of physical switch ports (including CPU port)
> +- a set of configured vlans
> +- a CPU specific port index
> +
> +A particular switch device is registered/unregistered using the following pair
> +of functions:
> +
> +register_switch(struct switch_dev *sw_dev, struct net_device *dev);
> +unregister_switch(struct switch_dev);
> +
> +A given switch driver can be backed by any kind of underlying bus driver (i2c
> +client, GPIO driver, MMIO driver, directly into the Ethernet MAC driver...).
> +
> +The set of common operations to all switches is represented by the "struct
> +switch_dev_ops" function pointers, these common operations are defined as such:
> +
> +- get the port list of a VLAN identifier
> +- set the port list of a VLAN identifier
> +- get the primary VLAN identifier of a port
> +- set the primary VLAN identifier of a port
> +- apply the changed configuration to the switch
> +- reset the switch
> +- get a port link status
> +- get a port statistics counters
> +
> +The switch_dev_ops structure also contains an extensible way of representing and
> +querying switch specific features, 3 different types of attributes are
> +available:
> +
> +- global attributes: attributes global to a switch (name, identifier, number of
> +  ports)
> +- port attributes: per-port specific attributes (MIB counters, enabling port
> +  mirroring...)
> +- vlan attributes: per-VLAN specific attributes (VLAN id, specific VLAN
> +  information)
> +
> +Each of these 3 categories must be represented using an array of "struct
> +switch_attr" attributes. This structure must be filed with:
> +
> +- an unique name for the operation
> +- a description for the operation
> +- a setter operation
> +- a getter operation
> +- a data type (string, integer, port)
> +- eventual min/max limits to validate user input data
> +
> +The "struct switch_attr" directly maps to a Generic Netlink type of command and
> +will be automatically discovered by the "swconfig" user-space utility without
> +requiring user-space changes.
> +
> +User-space reference tool
> +=========================
> +
> +A reference user-space implementation is provided in tools/swconfig in order to
> +directly configure and use a particular switch driver. This reference
> +implementation is linking against libnl-1 for the moment.
> +
> +To build it:
> +
> +make -C tools/swconfig
> +
> +To list the available switches:
> +
> +./tools/swconfig list
> +
> +And to show a particular switch configuration for instance:
> +
> +./tools/swconfig dev eth0 show
> +
> +Fake (simulation) switch driver
> +===============================
> +
> +A fake switch driver called swconfig-hwsim is provided in order to allow for
> +easy testing of API changes and to perform regression testing. This driver will
> +automatically map to the loopback device and will create a fake switch of up to
> +8 Gigabit ports. Each of these ports can be configured with separate
> +speed/duplex/link settings. This driver is gated with the CONFIG_SWCONFIG_HWSIM
> +configuration symbol.
> diff --git a/MAINTAINERS b/MAINTAINERS
> index f169259..3a54262 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -8117,6 +8117,16 @@ F:	lib/swiotlb.c
>  F:	arch/*/kernel/pci-swiotlb.c
>  F:	include/linux/swiotlb.h
>  
> +SWITCH CONFIGURATION API
> +M:	Florian Fainelli <f.fainelli@gmail.com>
> +L:	openwrt-devel@lists.openwrt.org
> +L:	netdev@vger.kernel.org
> +S:	Supported
> +F:	drivers/net/ethernet/phy/swconfig*.c
> +F:	include/uapi/linux/switch.h
> +F:	include/linux/switch.h
> +F:	Documentation/networking/swconfig.txt
> +
>  SYNOPSYS ARC ARCHITECTURE
>  M:	Vineet Gupta <vgupta@synopsys.com>
>  S:	Supported
> diff --git a/drivers/net/phy/Kconfig b/drivers/net/phy/Kconfig
> index 342561a..9b3e117 100644
> --- a/drivers/net/phy/Kconfig
> +++ b/drivers/net/phy/Kconfig
> @@ -12,6 +12,12 @@ menuconfig PHYLIB
>  
>  if PHYLIB
>  
> +config SWCONFIG
> +	tristate "Switch configuration API"
> +	---help---
> +	  Switch configuration API using netlink. This allows
> +	  you to configure the VLAN features of certain switches.
> +
>  comment "MII PHY device drivers"
>  
>  config AT803X_PHY
> diff --git a/drivers/net/phy/Makefile b/drivers/net/phy/Makefile
> index 23a2ab2..268c7de 100644
> --- a/drivers/net/phy/Makefile
> +++ b/drivers/net/phy/Makefile
> @@ -3,6 +3,7 @@
>  libphy-objs			:= phy.o phy_device.o mdio_bus.o
>  
>  obj-$(CONFIG_PHYLIB)		+= libphy.o
> +obj-$(CONFIG_SWCONFIG)		+= swconfig.o
>  obj-$(CONFIG_MARVELL_PHY)	+= marvell.o
>  obj-$(CONFIG_DAVICOM_PHY)	+= davicom.o
>  obj-$(CONFIG_CICADA_PHY)	+= cicada.o
> diff --git a/drivers/net/phy/swconfig.c b/drivers/net/phy/swconfig.c
> new file mode 100644
> index 0000000..9997c35
> --- /dev/null
> +++ b/drivers/net/phy/swconfig.c
> @@ -0,0 +1,1078 @@
> +/*
> + * Switch configuration API
> + *
> + * Copyright (C) 2008 Felix Fietkau <nbd@openwrt.org>
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public License
> + * as published by the Free Software Foundation; either version 2
> + * of the License, or (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + */
> +
> +#include <linux/types.h>
> +#include <linux/module.h>
> +#include <linux/init.h>
> +#include <linux/list.h>
> +#include <linux/if.h>
> +#include <linux/if_ether.h>
> +#include <linux/capability.h>
> +#include <linux/skbuff.h>
> +#include <linux/swconfig.h>
> +
> +#define SWCONFIG_DEVNAME	"switch%d"
> +
> +MODULE_AUTHOR("Felix Fietkau <nbd@openwrt.org>");
> +MODULE_LICENSE("GPL");
> +
> +static int swdev_id;
> +static struct list_head swdevs;
> +static DEFINE_SPINLOCK(swdevs_lock);
> +struct swconfig_callback;
> +
> +struct swconfig_callback {
> +	struct sk_buff *msg;
> +	struct genlmsghdr *hdr;
> +	struct genl_info *info;
> +	int cmd;
> +
> +	/* callback for filling in the message data */
> +	int (*fill)(struct swconfig_callback *cb, void *arg);
> +
> +	/* callback for closing the message before sending it */
> +	int (*close)(struct swconfig_callback *cb, void *arg);
> +
> +	struct nlattr *nest[4];
> +	int args[4];
> +};
> +
> +/* defaults */
> +
> +static int
> +swconfig_get_vlan_ports(struct switch_dev *dev,
> +			const struct switch_attr *attr, struct switch_val *val)
> +{
> +	int ret;
> +	if (val->port_vlan >= dev->vlans)
> +		return -EINVAL;
> +
> +	if (!dev->ops->get_vlan_ports)
> +		return -EOPNOTSUPP;
> +
> +	ret = dev->ops->get_vlan_ports(dev, val);
> +	return ret;
> +}
> +
> +static int
> +swconfig_set_vlan_ports(struct switch_dev *dev,
> +			const struct switch_attr *attr, struct switch_val *val)
> +{
> +	struct switch_port *ports = val->value.ports;
> +	const struct switch_dev_ops *ops = dev->ops;
> +	int i;
> +
> +	if (val->port_vlan >= dev->vlans)
> +		return -EINVAL;
> +
> +	/* validate ports */
> +	if (val->len > dev->ports)
> +		return -EINVAL;
> +
> +	if (!ops->set_vlan_ports)
> +		return -EOPNOTSUPP;
> +
> +	for (i = 0; i < val->len; i++) {
> +		if (ports[i].id >= dev->ports)
> +			return -EINVAL;
> +
> +		if (ops->set_port_pvid &&
> +		    !(ports[i].flags & (1 << SWITCH_PORT_FLAG_TAGGED)))
> +			ops->set_port_pvid(dev, ports[i].id, val->port_vlan);
> +	}
> +
> +	return ops->set_vlan_ports(dev, val);
> +}
> +
> +static int
> +swconfig_set_pvid(struct switch_dev *dev,
> +			const struct switch_attr *attr, struct switch_val *val)
> +{
> +	if (val->port_vlan >= dev->ports)
> +		return -EINVAL;
> +
> +	if (!dev->ops->set_port_pvid)
> +		return -EOPNOTSUPP;
> +
> +	return dev->ops->set_port_pvid(dev, val->port_vlan, val->value.i);
> +}
> +
> +static int
> +swconfig_get_pvid(struct switch_dev *dev,
> +			const struct switch_attr *attr, struct switch_val *val)
> +{
> +	if (val->port_vlan >= dev->ports)
> +		return -EINVAL;
> +
> +	if (!dev->ops->get_port_pvid)
> +		return -EOPNOTSUPP;
> +
> +	return dev->ops->get_port_pvid(dev, val->port_vlan, &val->value.i);
> +}
> +
> +static const char *
> +swconfig_speed_str(enum switch_port_speed speed)
> +{
> +	switch (speed) {
> +	case SWITCH_PORT_SPEED_10:
> +		return "10baseT";
> +	case SWITCH_PORT_SPEED_100:
> +		return "100baseT";
> +	case SWITCH_PORT_SPEED_1000:
> +		return "1000baseT";
> +	default:
> +		break;
> +	}
> +
> +	return "unknown";
> +}
> +
> +static int
> +swconfig_get_link(struct switch_dev *dev,
> +			const struct switch_attr *attr, struct switch_val *val)
> +{
> +	struct switch_port_link link;
> +	int len;
> +	int ret;
> +
> +	if (val->port_vlan >= dev->ports)
> +		return -EINVAL;
> +
> +	if (!dev->ops->get_port_link)
> +		return -EOPNOTSUPP;
> +
> +	memset(&link, 0, sizeof(link));
> +	ret = dev->ops->get_port_link(dev, val->port_vlan, &link);
> +	if (ret)
> +		return ret;
> +
> +	memset(dev->buf, 0, sizeof(dev->buf));
> +
> +	if (link.link)
> +		len = snprintf(dev->buf, sizeof(dev->buf),
> +			       "port:%d link:up speed:%s %s-duplex %s%s%s",
> +			       val->port_vlan,
> +			       swconfig_speed_str(link.speed),
> +			       link.duplex ? "full" : "half",
> +			       link.tx_flow ? "txflow " : "",
> +			       link.rx_flow ?	"rxflow " : "",
> +			       link.aneg ? "auto" : "");
> +	else
> +		len = snprintf(dev->buf, sizeof(dev->buf), "port:%d link:down",
> +			       val->port_vlan);
> +
> +	val->value.s = dev->buf;
> +	val->len = len;
> +
> +	return 0;
> +}
> +
> +static int
> +swconfig_apply_config(struct switch_dev *dev,
> +			const struct switch_attr *attr, struct switch_val *val)
> +{
> +	/* don't complain if not supported by the switch driver */
> +	if (!dev->ops->apply_config)
> +		return 0;
> +
> +	return dev->ops->apply_config(dev);
> +}
> +
> +static int
> +swconfig_reset_switch(struct switch_dev *dev,
> +			const struct switch_attr *attr, struct switch_val *val)
> +{
> +	/* don't complain if not supported by the switch driver */
> +	if (!dev->ops->reset_switch)
> +		return 0;
> +
> +	return dev->ops->reset_switch(dev);
> +}
> +
> +enum global_defaults {
> +	GLOBAL_APPLY,
> +	GLOBAL_RESET,
> +};
> +
> +enum vlan_defaults {
> +	VLAN_PORTS,
> +};
> +
> +enum port_defaults {
> +	PORT_PVID,
> +	PORT_LINK,
> +};
> +
> +static struct switch_attr default_global[] = {
> +	[GLOBAL_APPLY] = {
> +		.type = SWITCH_TYPE_NOVAL,
> +		.name = "apply",
> +		.description = "Activate changes in the hardware",
> +		.set = swconfig_apply_config,
> +	},
> +	[GLOBAL_RESET] = {
> +		.type = SWITCH_TYPE_NOVAL,
> +		.name = "reset",
> +		.description = "Reset the switch",
> +		.set = swconfig_reset_switch,
> +	}
> +};
> +
> +static struct switch_attr default_port[] = {
> +	[PORT_PVID] = {
> +		.type = SWITCH_TYPE_INT,
> +		.name = "pvid",
> +		.description = "Primary VLAN ID",
> +		.set = swconfig_set_pvid,
> +		.get = swconfig_get_pvid,
> +	},
> +	[PORT_LINK] = {
> +		.type = SWITCH_TYPE_STRING,
> +		.name = "link",
> +		.description = "Get port link information",
> +		.set = NULL,
> +		.get = swconfig_get_link,
> +	}
> +};
> +
> +static struct switch_attr default_vlan[] = {
> +	[VLAN_PORTS] = {
> +		.type = SWITCH_TYPE_PORTS,
> +		.name = "ports",
> +		.description = "VLAN port mapping",
> +		.set = swconfig_set_vlan_ports,
> +		.get = swconfig_get_vlan_ports,
> +	},
> +};
> +
> +static const struct switch_attr *
> +swconfig_find_attr_by_name(const struct switch_attrlist *alist,
> +				const char *name)
> +{
> +	int i;
> +
> +	for (i = 0; i < alist->n_attr; i++)
> +		if (strcmp(name, alist->attr[i].name) == 0)
> +			return &alist->attr[i];
> +
> +	return NULL;
> +}
> +
> +static void swconfig_defaults_init(struct switch_dev *dev)
> +{
> +	const struct switch_dev_ops *ops = dev->ops;
> +
> +	dev->def_global = 0;
> +	dev->def_vlan = 0;
> +	dev->def_port = 0;
> +
> +	if (ops->get_vlan_ports || ops->set_vlan_ports)
> +		set_bit(VLAN_PORTS, &dev->def_vlan);
> +
> +	if (ops->get_port_pvid || ops->set_port_pvid)
> +		set_bit(PORT_PVID, &dev->def_port);
> +
> +	if (ops->get_port_link &&
> +	    !swconfig_find_attr_by_name(&ops->attr_port, "link"))
> +		set_bit(PORT_LINK, &dev->def_port);
> +
> +	/* always present, can be no-op */
> +	set_bit(GLOBAL_APPLY, &dev->def_global);
> +	set_bit(GLOBAL_RESET, &dev->def_global);
> +}
> +
> +
> +static struct genl_family switch_fam = {
> +	.id = GENL_ID_GENERATE,
> +	.name = "switch",
> +	.hdrsize = 0,
> +	.version = 1,
> +	.maxattr = SWITCH_ATTR_MAX,
> +};
> +
> +static const struct nla_policy switch_policy[SWITCH_ATTR_MAX+1] = {
> +	[SWITCH_ATTR_ID] = { .type = NLA_U32 },
> +	[SWITCH_ATTR_OP_ID] = { .type = NLA_U32 },
> +	[SWITCH_ATTR_OP_PORT] = { .type = NLA_U32 },
> +	[SWITCH_ATTR_OP_VLAN] = { .type = NLA_U32 },
> +	[SWITCH_ATTR_OP_VALUE_INT] = { .type = NLA_U32 },
> +	[SWITCH_ATTR_OP_VALUE_STR] = { .type = NLA_NUL_STRING },
> +	[SWITCH_ATTR_OP_VALUE_PORTS] = { .type = NLA_NESTED },
> +	[SWITCH_ATTR_TYPE] = { .type = NLA_U32 },
> +};
> +
> +static const struct nla_policy port_policy[SWITCH_PORT_ATTR_MAX+1] = {
> +	[SWITCH_PORT_ID] = { .type = NLA_U32 },
> +	[SWITCH_PORT_FLAG_TAGGED] = { .type = NLA_FLAG },
> +};
> +
> +static inline void
> +swconfig_lock(void)
> +{
> +	spin_lock(&swdevs_lock);
> +}
> +
> +static inline void
> +swconfig_unlock(void)
> +{
> +	spin_unlock(&swdevs_lock);
> +}
> +
> +static struct switch_dev *
> +swconfig_get_dev(struct genl_info *info)
> +{
> +	struct switch_dev *dev = NULL;
> +	struct switch_dev *p;
> +	int id;
> +
> +	if (!info->attrs[SWITCH_ATTR_ID])
> +		goto done;
> +
> +	id = nla_get_u32(info->attrs[SWITCH_ATTR_ID]);
> +	swconfig_lock();
> +	list_for_each_entry(p, &swdevs, dev_list) {
> +		if (id != p->id)
> +			continue;
> +
> +		dev = p;
> +		break;
> +	}
> +	if (dev)
> +		mutex_lock(&dev->sw_mutex);
> +	else
> +		pr_debug("device %d not found\n", id);
> +	swconfig_unlock();
> +done:
> +	return dev;
> +}
> +
> +static inline void
> +swconfig_put_dev(struct switch_dev *dev)
> +{
> +	mutex_unlock(&dev->sw_mutex);
> +}
> +
> +static int
> +swconfig_dump_attr(struct swconfig_callback *cb, void *arg)
> +{
> +	struct switch_attr *op = arg;
> +	struct genl_info *info = cb->info;
> +	struct sk_buff *msg = cb->msg;
> +	int id = cb->args[0];
> +	void *hdr;
> +
> +	hdr = genlmsg_put(msg, info->snd_portid, info->snd_seq, &switch_fam,
> +			NLM_F_MULTI, SWITCH_CMD_NEW_ATTR);
> +	if (IS_ERR(hdr))
> +		return -1;
> +
> +	if (nla_put_u32(msg, SWITCH_ATTR_OP_ID, id))
> +		goto nla_put_failure;
> +	if (nla_put_u32(msg, SWITCH_ATTR_OP_TYPE, op->type))
> +		goto nla_put_failure;
> +	if (nla_put_string(msg, SWITCH_ATTR_OP_NAME, op->name))
> +		goto nla_put_failure;
> +	if (op->description)
> +		if (nla_put_string(msg, SWITCH_ATTR_OP_DESCRIPTION,
> +			op->description))
> +			goto nla_put_failure;
> +
> +	return genlmsg_end(msg, hdr);
> +nla_put_failure:
> +	genlmsg_cancel(msg, hdr);
> +	return -EMSGSIZE;
> +}
> +
> +/* spread multipart messages across multiple message buffers */
> +static int
> +swconfig_send_multipart(struct swconfig_callback *cb, void *arg)
> +{
> +	struct genl_info *info = cb->info;
> +	int restart = 0;
> +	int err;
> +
> +	do {
> +		if (!cb->msg) {
> +			cb->msg = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
> +			if (cb->msg == NULL)
> +				goto error;
> +		}
> +
> +		if (!(cb->fill(cb, arg) < 0))
> +			break;
> +
> +		/* fill failed, check if this was already the second attempt */
> +		if (restart)
> +			goto error;
> +
> +		/* try again in a new message, send the current one */
> +		restart = 1;
> +		if (cb->close) {
> +			if (cb->close(cb, arg) < 0)
> +				goto error;
> +		}
> +		err = genlmsg_reply(cb->msg, info);
> +		cb->msg = NULL;
> +		if (err < 0)
> +			goto error;
> +
> +	} while (restart);
> +
> +	return 0;
> +
> +error:
> +	if (cb->msg)
> +		nlmsg_free(cb->msg);
> +	return -1;
> +}
> +
> +static int
> +swconfig_list_attrs(struct sk_buff *skb, struct genl_info *info)
> +{
> +	struct genlmsghdr *hdr = nlmsg_data(info->nlhdr);
> +	const struct switch_attrlist *alist;
> +	struct switch_dev *dev;
> +	struct swconfig_callback cb;
> +	int err = -EINVAL;
> +	int i;
> +
> +	/* defaults */
> +	struct switch_attr *def_list;
> +	unsigned long *def_active;
> +	int n_def;
> +
> +	dev = swconfig_get_dev(info);
> +	if (!dev)
> +		return -EINVAL;
> +
> +	switch (hdr->cmd) {
> +	case SWITCH_CMD_LIST_GLOBAL:
> +		alist = &dev->ops->attr_global;
> +		def_list = default_global;
> +		def_active = &dev->def_global;
> +		n_def = ARRAY_SIZE(default_global);
> +		break;
> +	case SWITCH_CMD_LIST_VLAN:
> +		alist = &dev->ops->attr_vlan;
> +		def_list = default_vlan;
> +		def_active = &dev->def_vlan;
> +		n_def = ARRAY_SIZE(default_vlan);
> +		break;
> +	case SWITCH_CMD_LIST_PORT:
> +		alist = &dev->ops->attr_port;
> +		def_list = default_port;
> +		def_active = &dev->def_port;
> +		n_def = ARRAY_SIZE(default_port);
> +		break;
> +	default:
> +		WARN_ON(1);
> +		goto out;
> +	}
> +
> +	memset(&cb, 0, sizeof(cb));
> +	cb.info = info;
> +	cb.fill = swconfig_dump_attr;
> +	for (i = 0; i < alist->n_attr; i++) {
> +		if (alist->attr[i].disabled)
> +			continue;
> +		cb.args[0] = i;
> +		err = swconfig_send_multipart(&cb, (void *) &alist->attr[i]);
> +		if (err < 0)
> +			goto error;
> +	}
> +
> +	/* defaults */
> +	for (i = 0; i < n_def; i++) {
> +		if (!test_bit(i, def_active))
> +			continue;
> +		cb.args[0] = SWITCH_ATTR_DEFAULTS_OFFSET + i;
> +		err = swconfig_send_multipart(&cb, (void *) &def_list[i]);
> +		if (err < 0)
> +			goto error;
> +	}
> +	swconfig_put_dev(dev);
> +
> +	if (!cb.msg)
> +		return 0;
> +
> +	return genlmsg_reply(cb.msg, info);
> +
> +error:
> +	if (cb.msg)
> +		nlmsg_free(cb.msg);
> +out:
> +	swconfig_put_dev(dev);
> +	return err;
> +}
> +
> +static const struct switch_attr *
> +swconfig_lookup_attr(struct switch_dev *dev, struct genl_info *info,
> +		struct switch_val *val)
> +{
> +	struct genlmsghdr *hdr = nlmsg_data(info->nlhdr);
> +	const struct switch_attrlist *alist;
> +	const struct switch_attr *attr = NULL;
> +	int attr_id;
> +
> +	/* defaults */
> +	struct switch_attr *def_list;
> +	unsigned long *def_active;
> +	int n_def;
> +
> +	if (!info->attrs[SWITCH_ATTR_OP_ID])
> +		goto done;
> +
> +	switch (hdr->cmd) {
> +	case SWITCH_CMD_SET_GLOBAL:
> +	case SWITCH_CMD_GET_GLOBAL:
> +		alist = &dev->ops->attr_global;
> +		def_list = default_global;
> +		def_active = &dev->def_global;
> +		n_def = ARRAY_SIZE(default_global);
> +		break;
> +	case SWITCH_CMD_SET_VLAN:
> +	case SWITCH_CMD_GET_VLAN:
> +		alist = &dev->ops->attr_vlan;
> +		def_list = default_vlan;
> +		def_active = &dev->def_vlan;
> +		n_def = ARRAY_SIZE(default_vlan);
> +		if (!info->attrs[SWITCH_ATTR_OP_VLAN])
> +			goto done;
> +		val->port_vlan = nla_get_u32(info->attrs[SWITCH_ATTR_OP_VLAN]);
> +		if (val->port_vlan >= dev->vlans)
> +			goto done;
> +		break;
> +	case SWITCH_CMD_SET_PORT:
> +	case SWITCH_CMD_GET_PORT:
> +		alist = &dev->ops->attr_port;
> +		def_list = default_port;
> +		def_active = &dev->def_port;
> +		n_def = ARRAY_SIZE(default_port);
> +		if (!info->attrs[SWITCH_ATTR_OP_PORT])
> +			goto done;
> +		val->port_vlan = nla_get_u32(info->attrs[SWITCH_ATTR_OP_PORT]);
> +		if (val->port_vlan >= dev->ports)
> +			goto done;
> +		break;
> +	default:
> +		WARN_ON(1);
> +		goto done;
> +	}
> +
> +	if (!alist)
> +		goto done;
> +
> +	attr_id = nla_get_u32(info->attrs[SWITCH_ATTR_OP_ID]);
> +	if (attr_id >= SWITCH_ATTR_DEFAULTS_OFFSET) {
> +		attr_id -= SWITCH_ATTR_DEFAULTS_OFFSET;
> +		if (attr_id >= n_def)
> +			goto done;
> +		if (!test_bit(attr_id, def_active))
> +			goto done;
> +		attr = &def_list[attr_id];
> +	} else {
> +		if (attr_id >= alist->n_attr)
> +			goto done;
> +		attr = &alist->attr[attr_id];
> +	}
> +
> +	if (attr->disabled)
> +		attr = NULL;
> +
> +done:
> +	if (!attr)
> +		pr_debug("attribute lookup failed\n");
> +	val->attr = attr;
> +	return attr;
> +}
> +
> +static int
> +swconfig_parse_ports(struct sk_buff *msg, struct nlattr *head,
> +		struct switch_val *val, int max)
> +{
> +	struct nlattr *nla;
> +	int rem;
> +
> +	val->len = 0;
> +	nla_for_each_nested(nla, head, rem) {
> +		struct nlattr *tb[SWITCH_PORT_ATTR_MAX+1];
> +		struct switch_port *port = &val->value.ports[val->len];
> +
> +		if (val->len >= max)
> +			return -EINVAL;
> +
> +		if (nla_parse_nested(tb, SWITCH_PORT_ATTR_MAX, nla,
> +				port_policy))
> +			return -EINVAL;
> +
> +		if (!tb[SWITCH_PORT_ID])
> +			return -EINVAL;
> +
> +		port->id = nla_get_u32(tb[SWITCH_PORT_ID]);
> +		if (tb[SWITCH_PORT_FLAG_TAGGED])
> +			port->flags |= (1 << SWITCH_PORT_FLAG_TAGGED);
> +		val->len++;
> +	}
> +
> +	return 0;
> +}
> +
> +static int
> +swconfig_set_attr(struct sk_buff *skb, struct genl_info *info)
> +{
> +	const struct switch_attr *attr;
> +	struct switch_dev *dev;
> +	struct switch_val val;
> +	int err = -EINVAL;
> +
> +	dev = swconfig_get_dev(info);
> +	if (!dev)
> +		return -EINVAL;
> +
> +	memset(&val, 0, sizeof(val));
> +	attr = swconfig_lookup_attr(dev, info, &val);
> +	if (!attr || !attr->set)
> +		goto error;
> +
> +	val.attr = attr;
> +	switch (attr->type) {
> +	case SWITCH_TYPE_NOVAL:
> +		break;
> +	case SWITCH_TYPE_INT:
> +		if (!info->attrs[SWITCH_ATTR_OP_VALUE_INT])
> +			goto error;
> +		val.value.i =
> +			nla_get_u32(info->attrs[SWITCH_ATTR_OP_VALUE_INT]);
> +		break;
> +	case SWITCH_TYPE_STRING:
> +		if (!info->attrs[SWITCH_ATTR_OP_VALUE_STR])
> +			goto error;
> +		val.value.s =
> +			nla_data(info->attrs[SWITCH_ATTR_OP_VALUE_STR]);
> +		break;
> +	case SWITCH_TYPE_PORTS:
> +		val.value.ports = dev->portbuf;
> +		memset(dev->portbuf, 0,
> +			sizeof(struct switch_port) * dev->ports);
> +
> +		/* TODO: implement multipart? */
> +		if (info->attrs[SWITCH_ATTR_OP_VALUE_PORTS]) {
> +			err = swconfig_parse_ports(skb,
> +				info->attrs[SWITCH_ATTR_OP_VALUE_PORTS],
> +				&val, dev->ports);
> +			if (err < 0)
> +				goto error;
> +		} else {
> +			val.len = 0;
> +			err = 0;
> +		}
> +		break;
> +	default:
> +		goto error;
> +	}
> +
> +	err = attr->set(dev, attr, &val);
> +error:
> +	swconfig_put_dev(dev);
> +	return err;
> +}
> +
> +static int
> +swconfig_close_portlist(struct swconfig_callback *cb, void *arg)
> +{
> +	if (cb->nest[0])
> +		nla_nest_end(cb->msg, cb->nest[0]);
> +	return 0;
> +}
> +
> +static int
> +swconfig_send_port(struct swconfig_callback *cb, void *arg)
> +{
> +	const struct switch_port *port = arg;
> +	struct nlattr *p = NULL;
> +
> +	if (!cb->nest[0]) {
> +		cb->nest[0] = nla_nest_start(cb->msg, cb->cmd);
> +		if (!cb->nest[0])
> +			return -1;
> +	}
> +
> +	p = nla_nest_start(cb->msg, SWITCH_ATTR_PORT);
> +	if (!p)
> +		goto error;
> +
> +	if (nla_put_u32(cb->msg, SWITCH_PORT_ID, port->id))
> +		goto nla_put_failure;
> +	if (port->flags & (1 << SWITCH_PORT_FLAG_TAGGED)) {
> +		if (nla_put_flag(cb->msg, SWITCH_PORT_FLAG_TAGGED))
> +			goto nla_put_failure;
> +	}
> +
> +	nla_nest_end(cb->msg, p);
> +	return 0;
> +
> +nla_put_failure:
> +		nla_nest_cancel(cb->msg, p);
> +error:
> +	nla_nest_cancel(cb->msg, cb->nest[0]);
> +	return -1;
> +}
> +
> +static int
> +swconfig_send_ports(struct sk_buff **msg, struct genl_info *info, int attr,
> +		const struct switch_val *val)
> +{
> +	struct swconfig_callback cb;
> +	int err = 0;
> +	int i;
> +
> +	if (!val->value.ports)
> +		return -EINVAL;
> +
> +	memset(&cb, 0, sizeof(cb));
> +	cb.cmd = attr;
> +	cb.msg = *msg;
> +	cb.info = info;
> +	cb.fill = swconfig_send_port;
> +	cb.close = swconfig_close_portlist;
> +
> +	cb.nest[0] = nla_nest_start(cb.msg, cb.cmd);
> +	for (i = 0; i < val->len; i++) {
> +		err = swconfig_send_multipart(&cb, &val->value.ports[i]);
> +		if (err)
> +			goto done;
> +	}
> +	err = val->len;
> +	swconfig_close_portlist(&cb, NULL);
> +	*msg = cb.msg;
> +
> +done:
> +	return err;
> +}
> +
> +static int
> +swconfig_get_attr(struct sk_buff *skb, struct genl_info *info)
> +{
> +	struct genlmsghdr *hdr = nlmsg_data(info->nlhdr);
> +	const struct switch_attr *attr;
> +	struct switch_dev *dev;
> +	struct sk_buff *msg = NULL;
> +	struct switch_val val;
> +	int err = -EINVAL;
> +	int cmd = hdr->cmd;
> +
> +	dev = swconfig_get_dev(info);
> +	if (!dev)
> +		return -EINVAL;
> +
> +	memset(&val, 0, sizeof(val));
> +	attr = swconfig_lookup_attr(dev, info, &val);
> +	if (!attr || !attr->get)
> +		goto error;
> +
> +	if (attr->type == SWITCH_TYPE_PORTS) {
> +		val.value.ports = dev->portbuf;
> +		memset(dev->portbuf, 0,
> +			sizeof(struct switch_port) * dev->ports);
> +	}
> +
> +	err = attr->get(dev, attr, &val);
> +	if (err)
> +		goto error;
> +
> +	msg = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
> +	if (!msg)
> +		goto error;
> +
> +	hdr = genlmsg_put(msg, info->snd_portid, info->snd_seq, &switch_fam,
> +			0, cmd);
> +	if (IS_ERR(hdr))
> +		goto nla_put_failure;
> +
> +	switch (attr->type) {
> +	case SWITCH_TYPE_INT:
> +		if (nla_put_u32(msg, SWITCH_ATTR_OP_VALUE_INT, val.value.i))
> +			goto nla_put_failure;
> +		break;
> +	case SWITCH_TYPE_STRING:
> +		if (nla_put_string(msg, SWITCH_ATTR_OP_VALUE_STR, val.value.s))
> +			goto nla_put_failure;
> +		break;
> +	case SWITCH_TYPE_PORTS:
> +		err = swconfig_send_ports(&msg, info,
> +				SWITCH_ATTR_OP_VALUE_PORTS, &val);
> +		if (err < 0)
> +			goto nla_put_failure;
> +		break;
> +	default:
> +		pr_debug("invalid type in attribute\n");
> +		err = -EINVAL;
> +		goto error;
> +	}
> +	err = genlmsg_end(msg, hdr);
> +	if (err < 0)
> +		goto nla_put_failure;
> +
> +	swconfig_put_dev(dev);
> +	return genlmsg_reply(msg, info);
> +
> +nla_put_failure:
> +	if (msg)
> +		nlmsg_free(msg);
> +error:
> +	swconfig_put_dev(dev);
> +	if (!err)
> +		err = -ENOMEM;
> +	return err;
> +}
> +
> +static int
> +swconfig_send_switch(struct sk_buff *msg, u32 pid, u32 seq, int flags,
> +		const struct switch_dev *dev)
> +{
> +	struct nlattr *p = NULL, *m = NULL;
> +	void *hdr;
> +	int i;
> +
> +	hdr = genlmsg_put(msg, pid, seq, &switch_fam, flags,
> +			SWITCH_CMD_NEW_ATTR);
> +	if (IS_ERR(hdr))
> +		return -1;
> +
> +	if (nla_put_u32(msg, SWITCH_ATTR_ID, dev->id))
> +		goto nla_put_failure;
> +	if (nla_put_string(msg, SWITCH_ATTR_DEV_NAME, dev->devname))
> +		goto nla_put_failure;
> +	if (nla_put_string(msg, SWITCH_ATTR_ALIAS, dev->alias))
> +		goto nla_put_failure;
> +	if (nla_put_string(msg, SWITCH_ATTR_NAME, dev->name))
> +		goto nla_put_failure;
> +	if (nla_put_u32(msg, SWITCH_ATTR_VLANS, dev->vlans))
> +		goto nla_put_failure;
> +	if (nla_put_u32(msg, SWITCH_ATTR_PORTS, dev->ports))
> +		goto nla_put_failure;
> +	if (nla_put_u32(msg, SWITCH_ATTR_CPU_PORT, dev->cpu_port))
> +		goto nla_put_failure;
> +
> +	m = nla_nest_start(msg, SWITCH_ATTR_PORTMAP);
> +	if (!m)
> +		goto nla_put_failure;
> +	for (i = 0; i < dev->ports; i++) {
> +		p = nla_nest_start(msg, SWITCH_ATTR_PORTS);
> +		if (!p)
> +			continue;
> +		if (dev->portmap[i].s) {
> +			if (nla_put_string(msg, SWITCH_PORTMAP_SEGMENT,
> +						dev->portmap[i].s))
> +				goto nla_put_failure;
> +			if (nla_put_u32(msg, SWITCH_PORTMAP_VIRT,
> +						dev->portmap[i].virt))
> +				goto nla_put_failure;
> +		}
> +		nla_nest_end(msg, p);
> +	}
> +	nla_nest_end(msg, m);
> +	return genlmsg_end(msg, hdr);
> +nla_put_failure:
> +	genlmsg_cancel(msg, hdr);
> +	return -EMSGSIZE;
> +}
> +
> +static int swconfig_dump_switches(struct sk_buff *skb,
> +		struct netlink_callback *cb)
> +{
> +	struct switch_dev *dev;
> +	int start = cb->args[0];
> +	int idx = 0;
> +
> +	swconfig_lock();
> +	list_for_each_entry(dev, &swdevs, dev_list) {
> +		if (++idx <= start)
> +			continue;
> +		if (swconfig_send_switch(skb, NETLINK_CB(cb->skb).portid,
> +				cb->nlh->nlmsg_seq, NLM_F_MULTI,
> +				dev) < 0)
> +			break;
> +	}
> +	swconfig_unlock();
> +	cb->args[0] = idx;
> +
> +	return skb->len;
> +}
> +
> +static int
> +swconfig_done(struct netlink_callback *cb)
> +{
> +	return 0;
> +}
> +
> +static struct genl_ops swconfig_ops[] = {
> +	{
> +		.cmd = SWITCH_CMD_LIST_GLOBAL,
> +		.doit = swconfig_list_attrs,
> +		.policy = switch_policy,
> +	},
> +	{
> +		.cmd = SWITCH_CMD_LIST_VLAN,
> +		.doit = swconfig_list_attrs,
> +		.policy = switch_policy,
> +	},
> +	{
> +		.cmd = SWITCH_CMD_LIST_PORT,
> +		.doit = swconfig_list_attrs,
> +		.policy = switch_policy,
> +	},
> +	{
> +		.cmd = SWITCH_CMD_GET_GLOBAL,
> +		.doit = swconfig_get_attr,
> +		.policy = switch_policy,
> +	},
> +	{
> +		.cmd = SWITCH_CMD_GET_VLAN,
> +		.doit = swconfig_get_attr,
> +		.policy = switch_policy,
> +	},
> +	{
> +		.cmd = SWITCH_CMD_GET_PORT,
> +		.doit = swconfig_get_attr,
> +		.policy = switch_policy,
> +	},
> +	{
> +		.cmd = SWITCH_CMD_SET_GLOBAL,
> +		.doit = swconfig_set_attr,
> +		.policy = switch_policy,
> +	},
> +	{
> +		.cmd = SWITCH_CMD_SET_VLAN,
> +		.doit = swconfig_set_attr,
> +		.policy = switch_policy,
> +	},
> +	{
> +		.cmd = SWITCH_CMD_SET_PORT,
> +		.doit = swconfig_set_attr,
> +		.policy = switch_policy,
> +	},
> +	{
> +		.cmd = SWITCH_CMD_GET_SWITCH,
> +		.dumpit = swconfig_dump_switches,
> +		.policy = switch_policy,
> +		.done = swconfig_done,
> +	}
> +};
> +
> +int
> +register_switch(struct switch_dev *dev, struct net_device *netdev)
> +{
> +	struct switch_dev *sdev;
> +	const int max_switches = 8 * sizeof(unsigned long);
> +	unsigned long in_use = 0;
> +	int i;
> +
> +	INIT_LIST_HEAD(&dev->dev_list);
> +	if (netdev) {
> +		dev->netdev = netdev;
> +		if (!dev->alias)
> +			dev->alias = netdev->name;
> +	}
> +	BUG_ON(!dev->alias);
> +
> +	if (dev->ports > 0) {
> +		dev->portbuf = kzalloc(sizeof(struct switch_port) *
> +				dev->ports, GFP_KERNEL);
> +		if (!dev->portbuf)
> +			return -ENOMEM;
> +		dev->portmap = kzalloc(sizeof(struct switch_portmap) *
> +				dev->ports, GFP_KERNEL);
> +		if (!dev->portmap) {
> +			kfree(dev->portbuf);
> +			return -ENOMEM;
> +		}
> +	}
> +	swconfig_defaults_init(dev);
> +	mutex_init(&dev->sw_mutex);
> +	swconfig_lock();
> +	dev->id = ++swdev_id;
> +
> +	list_for_each_entry(sdev, &swdevs, dev_list) {
> +		if (!sscanf(sdev->devname, SWCONFIG_DEVNAME, &i))
> +			continue;
> +		if (i < 0 || i > max_switches)
> +			continue;
> +
> +		set_bit(i, &in_use);
> +	}
> +	i = find_first_zero_bit(&in_use, max_switches);
> +
> +	if (i == max_switches) {
> +		swconfig_unlock();
> +		return -ENFILE;
> +	}
> +
> +	/* fill device name */
> +	snprintf(dev->devname, IFNAMSIZ, SWCONFIG_DEVNAME, i);
> +
> +	list_add(&dev->dev_list, &swdevs);
> +	swconfig_unlock();
> +
> +	return 0;
> +}
> +EXPORT_SYMBOL_GPL(register_switch);
> +
> +void
> +unregister_switch(struct switch_dev *dev)
> +{
> +	kfree(dev->portbuf);
> +	mutex_lock(&dev->sw_mutex);
> +	swconfig_lock();
> +	list_del(&dev->dev_list);
> +	swconfig_unlock();
> +	mutex_unlock(&dev->sw_mutex);
> +}
> +EXPORT_SYMBOL_GPL(unregister_switch);
> +
> +
> +static int __init
> +swconfig_init(void)
> +{
> +	int i, err;
> +
> +	INIT_LIST_HEAD(&swdevs);
> +	err = genl_register_family(&switch_fam);
> +	if (err)
> +		return err;
> +
> +	for (i = 0; i < ARRAY_SIZE(swconfig_ops); i++) {
> +		err = genl_register_ops(&switch_fam, &swconfig_ops[i]);
> +		if (err)
> +			goto unregister;
> +	}
> +
> +	return 0;
> +
> +unregister:
> +	genl_unregister_family(&switch_fam);
> +	return err;
> +}
> +
> +static void __exit
> +swconfig_exit(void)
> +{
> +	genl_unregister_family(&switch_fam);
> +}
> +
> +module_init(swconfig_init);
> +module_exit(swconfig_exit);
> +
> diff --git a/include/linux/swconfig.h b/include/linux/swconfig.h
> new file mode 100644
> index 0000000..fd96eec
> --- /dev/null
> +++ b/include/linux/swconfig.h
> @@ -0,0 +1,180 @@
> +/*
> + * Switch configuration API
> + *
> + * Copyright (C) 2008 Felix Fietkau <nbd@openwrt.org>
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public License
> + * as published by the Free Software Foundation; either version 2
> + * of the License, or (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + */
> +#ifndef _LINUX_SWITCH_H
> +#define _LINUX_SWITCH_H
> +
> +#include <net/genetlink.h>
> +#include <uapi/linux/swconfig.h>
> +
> +struct switch_dev;
> +struct switch_op;
> +struct switch_val;
> +struct switch_attr;
> +struct switch_attrlist;
> +struct switch_led_trigger;
> +
> +int register_switch(struct switch_dev *dev, struct net_device *netdev);
> +void unregister_switch(struct switch_dev *dev);
> +
> +/**
> + * struct switch_attrlist - attribute list
> + *
> + * @n_attr: number of attributes
> + * @attr: pointer to the attributes array
> + */
> +struct switch_attrlist {
> +	int n_attr;
> +	const struct switch_attr *attr;
> +};
> +
> +enum switch_port_speed {
> +	SWITCH_PORT_SPEED_UNKNOWN = 0,
> +	SWITCH_PORT_SPEED_10 = 10,
> +	SWITCH_PORT_SPEED_100 = 100,
> +	SWITCH_PORT_SPEED_1000 = 1000,
> +};
> +
> +struct switch_port_link {
> +	bool link;
> +	bool duplex;
> +	bool aneg;
> +	bool tx_flow;
> +	bool rx_flow;
> +	enum switch_port_speed speed;
> +};
> +
> +struct switch_port_stats {
> +	unsigned long tx_bytes;
> +	unsigned long rx_bytes;
> +};
> +
> +/**
> + * struct switch_dev_ops - switch driver operations
> + *
> + * @attr_global: global switch attribute list
> + * @attr_port: port attribute list
> + * @attr_vlan: vlan attribute list
> + *
> + * Callbacks:
> + *
> + * @get_vlan_ports: read the port list of a VLAN
> + * @set_vlan_ports: set the port list of a VLAN
> + *
> + * @get_port_pvid: get the primary VLAN ID of a port
> + * @set_port_pvid: set the primary VLAN ID of a port
> + *
> + * @apply_config: apply all changed settings to the switch
> + * @reset_switch: resetting the switch
> + *
> + * @get_port_link: read the port link status
> + * @get_port_stats: read the port statistics counters
> + */
> +struct switch_dev_ops {
> +	struct switch_attrlist attr_global, attr_port, attr_vlan;
> +
> +	int (*get_vlan_ports)(struct switch_dev *dev, struct switch_val *val);
> +	int (*set_vlan_ports)(struct switch_dev *dev, struct switch_val *val);
> +
> +	int (*get_port_pvid)(struct switch_dev *dev, int port, int *val);
> +	int (*set_port_pvid)(struct switch_dev *dev, int port, int val);
> +
> +	int (*apply_config)(struct switch_dev *dev);
> +	int (*reset_switch)(struct switch_dev *dev);
> +
> +	int (*get_port_link)(struct switch_dev *dev, int port,
> +			     struct switch_port_link *link);
> +	int (*get_port_stats)(struct switch_dev *dev, int port,
> +			      struct switch_port_stats *stats);
> +};
> +
> +/**
> + * struct switch_dev - switch device
> + *
> + * @ops: switch driver operations pointer
> + * @devname: switch device name (automatically filled)
> + * @name: switch driver name returned to user-space
> + * @alias: alias name for the switch (instead of ethX) returned to user-space
> + * @netdev: network device pointer if alias is not used
> + *
> + * @ports: number of physical switch ports
> + * @vlans: number of supported VLANs
> + * @cpu_port: identifier for the CPU port
> + */
> +struct switch_dev {
> +	const struct switch_dev_ops *ops;
> +	/* will be automatically filled */
> +	char devname[IFNAMSIZ];
> +
> +	const char *name;
> +	/* NB: either alias or netdev must be set */
> +	const char *alias;
> +	struct net_device *netdev;
> +
> +	int ports;
> +	int vlans;
> +	int cpu_port;
> +
> +	/* the following fields are internal for swconfig */
> +	int id;
> +	struct list_head dev_list;
> +	unsigned long def_global, def_port, def_vlan;
> +
> +	struct mutex sw_mutex;
> +	struct switch_port *portbuf;
> +	struct switch_portmap *portmap;
> +
> +	char buf[128];
> +};
> +
> +struct switch_port {
> +	u32 id;
> +	u32 flags;
> +};
> +
> +struct switch_portmap {
> +	u32 virt;
> +	const char *s;
> +};
> +
> +struct switch_val {
> +	const struct switch_attr *attr;
> +	int port_vlan;
> +	int len;
> +	union {
> +		const char *s;
> +		u32 i;
> +		struct switch_port *ports;
> +	} value;
> +};
> +
> +struct switch_attr {
> +	int disabled;
> +	int type;
> +	const char *name;
> +	const char *description;
> +
> +	int (*set)(struct switch_dev *dev, const struct switch_attr *attr,
> +			struct switch_val *val);
> +	int (*get)(struct switch_dev *dev, const struct switch_attr *attr,
> +			struct switch_val *val);
> +
> +	/* for driver internal use */
> +	int id;
> +	int ofs;
> +	int max;
> +};
> +
> +#endif /* _LINUX_SWITCH_H */
> diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild
> index 115add2..0a995be 100644
> --- a/include/uapi/linux/Kbuild
> +++ b/include/uapi/linux/Kbuild
> @@ -363,6 +363,7 @@ header-y += stddef.h
>  header-y += string.h
>  header-y += suspend_ioctls.h
>  header-y += swab.h
> +header-y += swconfig.h
>  header-y += synclink.h
>  header-y += sysctl.h
>  header-y += sysinfo.h
> diff --git a/include/uapi/linux/swconfig.h b/include/uapi/linux/swconfig.h
> new file mode 100644
> index 0000000..17cf178
> --- /dev/null
> +++ b/include/uapi/linux/swconfig.h
> @@ -0,0 +1,103 @@
> +/*
> + * Switch configuration API
> + *
> + * Copyright (C) 2008 Felix Fietkau <nbd@openwrt.org>
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public License
> + * as published by the Free Software Foundation; either version 2
> + * of the License, or (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + */
> +
> +#ifndef _UAPI_LINUX_SWITCH_H
> +#define _UAPI_LINUX_SWITCH_H
> +
> +#include <linux/types.h>
> +#include <linux/netdevice.h>
> +#include <linux/netlink.h>
> +#include <linux/genetlink.h>
> +#ifndef __KERNEL__
> +#include <netlink/netlink.h>
> +#include <netlink/genl/genl.h>
> +#include <netlink/genl/ctrl.h>
> +#endif
> +
> +/* main attributes */
> +enum {
> +	SWITCH_ATTR_UNSPEC,
> +	/* global */
> +	SWITCH_ATTR_TYPE,
> +	/* device */
> +	SWITCH_ATTR_ID,
> +	SWITCH_ATTR_DEV_NAME,
> +	SWITCH_ATTR_ALIAS,
> +	SWITCH_ATTR_NAME,
> +	SWITCH_ATTR_VLANS,
> +	SWITCH_ATTR_PORTS,
> +	SWITCH_ATTR_PORTMAP,
> +	SWITCH_ATTR_CPU_PORT,
> +	/* attributes */
> +	SWITCH_ATTR_OP_ID,
> +	SWITCH_ATTR_OP_TYPE,
> +	SWITCH_ATTR_OP_NAME,
> +	SWITCH_ATTR_OP_PORT,
> +	SWITCH_ATTR_OP_VLAN,
> +	SWITCH_ATTR_OP_VALUE_INT,
> +	SWITCH_ATTR_OP_VALUE_STR,
> +	SWITCH_ATTR_OP_VALUE_PORTS,
> +	SWITCH_ATTR_OP_DESCRIPTION,
> +	/* port lists */
> +	SWITCH_ATTR_PORT,
> +	SWITCH_ATTR_MAX
> +};
> +
> +enum {
> +	/* port map */
> +	SWITCH_PORTMAP_PORTS,
> +	SWITCH_PORTMAP_SEGMENT,
> +	SWITCH_PORTMAP_VIRT,
> +	SWITCH_PORTMAP_MAX
> +};
> +
> +/* commands */
> +enum {
> +	SWITCH_CMD_UNSPEC,
> +	SWITCH_CMD_GET_SWITCH,
> +	SWITCH_CMD_NEW_ATTR,
> +	SWITCH_CMD_LIST_GLOBAL,
> +	SWITCH_CMD_GET_GLOBAL,
> +	SWITCH_CMD_SET_GLOBAL,
> +	SWITCH_CMD_LIST_PORT,
> +	SWITCH_CMD_GET_PORT,
> +	SWITCH_CMD_SET_PORT,
> +	SWITCH_CMD_LIST_VLAN,
> +	SWITCH_CMD_GET_VLAN,
> +	SWITCH_CMD_SET_VLAN
> +};
> +
> +/* data types */
> +enum switch_val_type {
> +	SWITCH_TYPE_UNSPEC,
> +	SWITCH_TYPE_INT,
> +	SWITCH_TYPE_STRING,
> +	SWITCH_TYPE_PORTS,
> +	SWITCH_TYPE_NOVAL,
> +};
> +
> +/* port nested attributes */
> +enum {
> +	SWITCH_PORT_UNSPEC,
> +	SWITCH_PORT_ID,
> +	SWITCH_PORT_FLAG_TAGGED,
> +	SWITCH_PORT_ATTR_MAX
> +};
> +
> +#define SWITCH_ATTR_DEFAULTS_OFFSET	0x1000
> +
> +
> +#endif /* _UAPI_LINUX_SWITCH_H */


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Florian Fainelli Oct. 22, 2013, 7:32 p.m. UTC | #2
2013/10/22 Dan Williams <dcbw@redhat.com>:
> On Tue, 2013-10-22 at 11:23 -0700, Florian Fainelli wrote:
>> This patch adds an Ethernet Switch generic netlink configuration API
>> which allows for doing the required configuration of managed Ethernet
>> switches commonly found in Wireless/Cable/DSL routers in the market.
>
> "swconfig" probably means "switch config", but is there any way to
> rename this away from the "sw" prefix, since "sw" typically means
> "software" and not "switch"?

Sure, how about something like "enetsw"? I would like to avoid using
"switch" too much since this is a C reserved keyword.
David Miller Oct. 22, 2013, 7:46 p.m. UTC | #3
From: Dan Williams <dcbw@redhat.com>
Date: Tue, 22 Oct 2013 14:22:26 -0500

> On Tue, 2013-10-22 at 11:23 -0700, Florian Fainelli wrote:
>> This patch adds an Ethernet Switch generic netlink configuration API
>> which allows for doing the required configuration of managed Ethernet
>> switches commonly found in Wireless/Cable/DSL routers in the market.
> 
> "swconfig" probably means "switch config", but is there any way to
> rename this away from the "sw" prefix, since "sw" typically means
> "software" and not "switch"?

Agreed.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Miller Oct. 22, 2013, 7:47 p.m. UTC | #4
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Tue, 22 Oct 2013 12:32:29 -0700

> 2013/10/22 Dan Williams <dcbw@redhat.com>:
>> On Tue, 2013-10-22 at 11:23 -0700, Florian Fainelli wrote:
>>> This patch adds an Ethernet Switch generic netlink configuration API
>>> which allows for doing the required configuration of managed Ethernet
>>> switches commonly found in Wireless/Cable/DSL routers in the market.
>>
>> "swconfig" probably means "switch config", but is there any way to
>> rename this away from the "sw" prefix, since "sw" typically means
>> "software" and not "switch"?
> 
> Sure, how about something like "enetsw"? I would like to avoid using
> "switch" too much since this is a C reserved keyword.

"swtch"? :-)
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
John Fastabend Oct. 22, 2013, 7:53 p.m. UTC | #5
On 10/22/2013 11:23 AM, Florian Fainelli wrote:
> This patch adds an Ethernet Switch generic netlink configuration API
> which allows for doing the required configuration of managed Ethernet
> switches commonly found in Wireless/Cable/DSL routers in the market.
>
> Since this API is based on the Generic Netlink infrastructure it is very
> easy to extend a particular switch driver to support additional features
> and to adapt it to specific switches.
>

> So far the API includes support for:
>
> - getting/setting a port VLAN id
> - getting/setting VLAN port membership
> - getting a port link status
> - getting a port statistics counters
> - resetting a switch device
> - applying a configuration to a switch device
>

Did you consider exposing each physical switch port as a netdevice on
the host? I would assume your switch driver could do this.

Then you can drop the port specific attributes (link status, stats, etc)
and use existing interfaces. The win being my tools work equally well on
your real switch as they do on my software switch. Also by exposing net
devices you provide a mechanism to send packets over the port and trap
control packets.

Next instead of creating a switch specific netlink API could you use
the existing FDB API? Again what I would like is for my existing
applications to run on the switch without having to rewrite them. For
example it would be great to have 'bridge fdb show dev myswitch' report
the correct tables for both the Sw bridge, a real switch bridge, and
for the embedded SR-IOV bridge case.

I added Jamal and Neil because I think I remember talking about similar
ideas with them before.

Thanks,
.John


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Florian Fainelli Oct. 22, 2013, 7:59 p.m. UTC | #6
2013/10/22 John Fastabend <john.r.fastabend@intel.com>:
> On 10/22/2013 11:23 AM, Florian Fainelli wrote:
>>
>> This patch adds an Ethernet Switch generic netlink configuration API
>> which allows for doing the required configuration of managed Ethernet
>> switches commonly found in Wireless/Cable/DSL routers in the market.
>>
>> Since this API is based on the Generic Netlink infrastructure it is very
>> easy to extend a particular switch driver to support additional features
>> and to adapt it to specific switches.
>>
>
>> So far the API includes support for:
>>
>> - getting/setting a port VLAN id
>> - getting/setting VLAN port membership
>> - getting a port link status
>> - getting a port statistics counters
>> - resetting a switch device
>> - applying a configuration to a switch device
>>
>
> Did you consider exposing each physical switch port as a netdevice on
> the host? I would assume your switch driver could do this.
>
> Then you can drop the port specific attributes (link status, stats, etc)
> and use existing interfaces. The win being my tools work equally well on
> your real switch as they do on my software switch. Also by exposing net
> devices you provide a mechanism to send packets over the port and trap
> control packets.

Well this is exactly what DSA does and which I do not like because it
is completely overkill for most switches out there which are using
802.1q tags and do not prepend/append proprietary tags for internal
traffic classification.

>
> Next instead of creating a switch specific netlink API could you use
> the existing FDB API? Again what I would like is for my existing
> applications to run on the switch without having to rewrite them. For
> example it would be great to have 'bridge fdb show dev myswitch' report
> the correct tables for both the Sw bridge, a real switch bridge, and
> for the embedded SR-IOV bridge case.

Ok, I know nothing about the FDB API, but will take a look and see if
that sounds suitable for the embedded use cases.

>
> I added Jamal and Neil because I think I remember talking about similar
> ideas with them before.

Thanks!
Neil Horman Oct. 22, 2013, 8:25 p.m. UTC | #7
On Tue, Oct 22, 2013 at 12:59:12PM -0700, Florian Fainelli wrote:
> 2013/10/22 John Fastabend <john.r.fastabend@intel.com>:
> > On 10/22/2013 11:23 AM, Florian Fainelli wrote:
> >>
> >> This patch adds an Ethernet Switch generic netlink configuration API
> >> which allows for doing the required configuration of managed Ethernet
> >> switches commonly found in Wireless/Cable/DSL routers in the market.
> >>
> >> Since this API is based on the Generic Netlink infrastructure it is very
> >> easy to extend a particular switch driver to support additional features
> >> and to adapt it to specific switches.
> >>
> >
> >> So far the API includes support for:
> >>
> >> - getting/setting a port VLAN id
> >> - getting/setting VLAN port membership
> >> - getting a port link status
> >> - getting a port statistics counters
> >> - resetting a switch device
> >> - applying a configuration to a switch device
> >>
> >
> > Did you consider exposing each physical switch port as a netdevice on
> > the host? I would assume your switch driver could do this.
> >
> > Then you can drop the port specific attributes (link status, stats, etc)
> > and use existing interfaces. The win being my tools work equally well on
> > your real switch as they do on my software switch. Also by exposing net
> > devices you provide a mechanism to send packets over the port and trap
> > control packets.
> 
> Well this is exactly what DSA does and which I do not like because it
> is completely overkill for most switches out there which are using
> 802.1q tags and do not prepend/append proprietary tags for internal
> traffic classification.
> 
> >
> > Next instead of creating a switch specific netlink API could you use
> > the existing FDB API? Again what I would like is for my existing
> > applications to run on the switch without having to rewrite them. For
> > example it would be great to have 'bridge fdb show dev myswitch' report
> > the correct tables for both the Sw bridge, a real switch bridge, and
> > for the embedded SR-IOV bridge case.
> 
> Ok, I know nothing about the FDB API, but will take a look and see if
> that sounds suitable for the embedded use cases.
> 
Further to Johns comments, why are you creating a new netlink protocol for this?
It seems that 90% of what you want to accomplish above is handled by rtnetlink.
As long as you write your driver properly, most of that should "just work".

Neil

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Miller Oct. 22, 2013, 9:22 p.m. UTC | #8
From: Dan Williams <dcbw@redhat.com>
Date: Tue, 22 Oct 2013 16:25:50 -0500

> On Tue, 2013-10-22 at 15:47 -0400, David Miller wrote:
>> From: Florian Fainelli <f.fainelli@gmail.com>
>> Date: Tue, 22 Oct 2013 12:32:29 -0700
>> 
>> > 2013/10/22 Dan Williams <dcbw@redhat.com>:
>> >> On Tue, 2013-10-22 at 11:23 -0700, Florian Fainelli wrote:
>> >>> This patch adds an Ethernet Switch generic netlink configuration API
>> >>> which allows for doing the required configuration of managed Ethernet
>> >>> switches commonly found in Wireless/Cable/DSL routers in the market.
>> >>
>> >> "swconfig" probably means "switch config", but is there any way to
>> >> rename this away from the "sw" prefix, since "sw" typically means
>> >> "software" and not "switch"?
>> > 
>> > Sure, how about something like "enetsw"? I would like to avoid using
>> > "switch" too much since this is a C reserved keyword.
>> 
>> "swtch"? :-)
> 
> haha...  seriously though, "enetsw" or even "esw" or "ensw" would be
> better than plain "sw".  Your choice, I have no horse in the race other
> than the "not sw" horse :)

"enetsw" is fine by me :-)
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Florian Fainelli Oct. 22, 2013, 10:09 p.m. UTC | #9
2013/10/22 Neil Horman <nhorman@tuxdriver.com>:
> On Tue, Oct 22, 2013 at 12:59:12PM -0700, Florian Fainelli wrote:
>> 2013/10/22 John Fastabend <john.r.fastabend@intel.com>:
>> > On 10/22/2013 11:23 AM, Florian Fainelli wrote:
>> >>
>> >> This patch adds an Ethernet Switch generic netlink configuration API
>> >> which allows for doing the required configuration of managed Ethernet
>> >> switches commonly found in Wireless/Cable/DSL routers in the market.
>> >>
>> >> Since this API is based on the Generic Netlink infrastructure it is very
>> >> easy to extend a particular switch driver to support additional features
>> >> and to adapt it to specific switches.
>> >>
>> >
>> >> So far the API includes support for:
>> >>
>> >> - getting/setting a port VLAN id
>> >> - getting/setting VLAN port membership
>> >> - getting a port link status
>> >> - getting a port statistics counters
>> >> - resetting a switch device
>> >> - applying a configuration to a switch device
>> >>
>> >
>> > Did you consider exposing each physical switch port as a netdevice on
>> > the host? I would assume your switch driver could do this.
>> >
>> > Then you can drop the port specific attributes (link status, stats, etc)
>> > and use existing interfaces. The win being my tools work equally well on
>> > your real switch as they do on my software switch. Also by exposing net
>> > devices you provide a mechanism to send packets over the port and trap
>> > control packets.
>>
>> Well this is exactly what DSA does and which I do not like because it
>> is completely overkill for most switches out there which are using
>> 802.1q tags and do not prepend/append proprietary tags for internal
>> traffic classification.
>>
>> >
>> > Next instead of creating a switch specific netlink API could you use
>> > the existing FDB API? Again what I would like is for my existing
>> > applications to run on the switch without having to rewrite them. For
>> > example it would be great to have 'bridge fdb show dev myswitch' report
>> > the correct tables for both the Sw bridge, a real switch bridge, and
>> > for the embedded SR-IOV bridge case.
>>
>> Ok, I know nothing about the FDB API, but will take a look and see if
>> that sounds suitable for the embedded use cases.
>>
> Further to Johns comments, why are you creating a new netlink protocol for this?
> It seems that 90% of what you want to accomplish above is handled by rtnetlink.
> As long as you write your driver properly, most of that should "just work".

This is not a new netlink protocol, but a generic netlink family. Why
would I extend rtnetlink to cover the remaining 10% which are not
going to be used on desktop and servers when a new generic netlink
family is cheap and can be selectively disabled in the kernel?
Neil Horman Oct. 23, 2013, 11:34 a.m. UTC | #10
On Tue, Oct 22, 2013 at 03:09:32PM -0700, Florian Fainelli wrote:
> 2013/10/22 Neil Horman <nhorman@tuxdriver.com>:
> > On Tue, Oct 22, 2013 at 12:59:12PM -0700, Florian Fainelli wrote:
> >> 2013/10/22 John Fastabend <john.r.fastabend@intel.com>:
> >> > On 10/22/2013 11:23 AM, Florian Fainelli wrote:
> >> >>
> >> >> This patch adds an Ethernet Switch generic netlink configuration API
> >> >> which allows for doing the required configuration of managed Ethernet
> >> >> switches commonly found in Wireless/Cable/DSL routers in the market.
> >> >>
> >> >> Since this API is based on the Generic Netlink infrastructure it is very
> >> >> easy to extend a particular switch driver to support additional features
> >> >> and to adapt it to specific switches.
> >> >>
> >> >
> >> >> So far the API includes support for:
> >> >>
> >> >> - getting/setting a port VLAN id
> >> >> - getting/setting VLAN port membership
> >> >> - getting a port link status
> >> >> - getting a port statistics counters
> >> >> - resetting a switch device
> >> >> - applying a configuration to a switch device
> >> >>
> >> >
> >> > Did you consider exposing each physical switch port as a netdevice on
> >> > the host? I would assume your switch driver could do this.
> >> >
> >> > Then you can drop the port specific attributes (link status, stats, etc)
> >> > and use existing interfaces. The win being my tools work equally well on
> >> > your real switch as they do on my software switch. Also by exposing net
> >> > devices you provide a mechanism to send packets over the port and trap
> >> > control packets.
> >>
> >> Well this is exactly what DSA does and which I do not like because it
> >> is completely overkill for most switches out there which are using
> >> 802.1q tags and do not prepend/append proprietary tags for internal
> >> traffic classification.
> >>
> >> >
> >> > Next instead of creating a switch specific netlink API could you use
> >> > the existing FDB API? Again what I would like is for my existing
> >> > applications to run on the switch without having to rewrite them. For
> >> > example it would be great to have 'bridge fdb show dev myswitch' report
> >> > the correct tables for both the Sw bridge, a real switch bridge, and
> >> > for the embedded SR-IOV bridge case.
> >>
> >> Ok, I know nothing about the FDB API, but will take a look and see if
> >> that sounds suitable for the embedded use cases.
> >>
> > Further to Johns comments, why are you creating a new netlink protocol for this?
> > It seems that 90% of what you want to accomplish above is handled by rtnetlink.
> > As long as you write your driver properly, most of that should "just work".
> 
> This is not a new netlink protocol, but a generic netlink family. Why
Thats hair splitting.  The point I'm making here is that you're creating a new
communication path from user space to the kernel to do something that we already
have a communication path to do.

> would I extend rtnetlink to cover the remaining 10% which are not
> going to be used on desktop and servers when a new generic netlink
> family is cheap and can be selectively disabled in the kernel?
90% of it is already done on servers and desktops using rtnetlink (thats my
point), and you can reasonably add the other 10% (I think), if you just expose
the switch ports as their own ethernet interfaces.  You say DSA is overkill, but
if you just add the other switch ports as their own ethernet interfaces, you
would get most of the above work for free, which seems to me like less overkill
than a new netlink family and userspace tools. 

Regards
Neil

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jamal Hadi Salim Oct. 23, 2013, 11:47 a.m. UTC | #11
On 10/22/13 18:09, Florian Fainelli wrote:
> 2013/10/22 Neil Horman <nhorman@tuxdriver.com>:
>> On Tue, Oct 22, 2013 at 12:59:12PM -0700, Florian Fainelli wrote:
>>> 2013/10/22 John Fastabend <john.r.fastabend@intel.com>:
>>>> On 10/22/2013 11:23 AM, Florian Fainelli wrote:
>>>>>
>
>>>
>>> Ok, I know nothing about the FDB API, but will take a look and see if
>>> that sounds suitable for the embedded use cases.
>>>
>> Further to Johns comments, why are you creating a new netlink protocol for this?
>> It seems that 90% of what you want to accomplish above is handled by rtnetlink.
>> As long as you write your driver properly, most of that should "just work".
>
> This is not a new netlink protocol, but a generic netlink family. Why
> would I extend rtnetlink to cover the remaining 10% which are not
> going to be used on desktop and servers when a new generic netlink
> family is cheap and can be selectively disabled in the kernel?
>

Florian,

I think it would be fantastic if you adopt the FDB API. The comment
to use rtnetlink configure is valid. You can configure hardware
switches as John has shown. I realize you guys have invested
tons of time and this stuff has been tested by tons of people and this
is a painful exercise to go through, but:
having more than one approach for configuring/controlling kernel
switch interfaces is not ideal. If you use the rtnetlink API then one
can configure both the Linux bridge, embedded intel switches, etc with
iproute2. i.e the switch becomes a bridge. I see a lot of commonolity
between your model based on what you described and the current bridge.
Pull the latest iproute2 code and look at "bridge" command.

Essentially, the current bridged could be described as an entity
that does L2 switching:
a) it has bridge ports which are any netdevs on Linux
b) it has an FDB which constitutes a MAC address as the lookup and 
optionally a VLAN. You can control learning and flooding.
c) it has vlan filtering capabilities which you can turn on/off. The
vlan capability to sellect PVIDs is also built in.
d) It has multicast snooping

I think your model needs #a and #b, you can ignore the rest.
I am not quiet sure how vlan port membership will apply; an fdb for
each entry will have a vlan. You could also create one bridge per vlan
(not the best  approach) - ccing Vlad and Stephen.


cheers,
jamal




--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Felix Fietkau Oct. 23, 2013, 12:04 p.m. UTC | #12
On 2013-10-23 1:47 PM, Jamal Hadi Salim wrote:
> Florian,
> 
> I think it would be fantastic if you adopt the FDB API. The comment
> to use rtnetlink configure is valid. You can configure hardware
> switches as John has shown. I realize you guys have invested
> tons of time and this stuff has been tested by tons of people and this
> is a painful exercise to go through, but:
> having more than one approach for configuring/controlling kernel
> switch interfaces is not ideal. If you use the rtnetlink API then one
> can configure both the Linux bridge, embedded intel switches, etc with
> iproute2. i.e the switch becomes a bridge. I see a lot of commonolity
> between your model based on what you described and the current bridge.
> Pull the latest iproute2 code and look at "bridge" command.
> 
> Essentially, the current bridged could be described as an entity
> that does L2 switching:
> a) it has bridge ports which are any netdevs on Linux
> b) it has an FDB which constitutes a MAC address as the lookup and 
> optionally a VLAN. You can control learning and flooding.
> c) it has vlan filtering capabilities which you can turn on/off. The
> vlan capability to sellect PVIDs is also built in.
> d) It has multicast snooping
> 
> I think your model needs #a and #b, you can ignore the rest.
> I am not quiet sure how vlan port membership will apply; an fdb for
> each entry will have a vlan. You could also create one bridge per vlan
> (not the best  approach) - ccing Vlad and Stephen.
I still don't understand how this is supposed to work with the kind of
switches that we're supporting with swconfig.

A typical switch has something like 5-8 ports (+ one port that goes to
the CPU), and handles the entire forwarding path on its own. It usually
allows creating VLANs and assigning ports to them (tagged, untagged),
but many (probably most) switches do not support controlling the
forwarding path via a MAC address based FDB.

Many also do not have support for a packet header to indicate the
incoming/outgoing switch port, so creating one netdev per port will work
only for link status, not for the data path.

- Felix
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jamal Hadi Salim Oct. 23, 2013, 12:53 p.m. UTC | #13
On 10/23/13 08:04, Felix Fietkau wrote:


> A typical switch has something like 5-8 ports (+ one port that goes to
> the CPU),

My opinion:
So exposing the 5-8 ports as netdevs would be useful. Giving access to
their stats through per-port netdevs etc. i.e a switch/bridge will show
up on bootup and the 5-8 ports as well. The 5-8 ports will show up
as bridge ports to the switch.
If something requires other "services" like l3 - I am assuming that
would show up in the cpu port, but its role is really to demux
and send it to ingress of the originating port on ASIC (i.e dont
think it should be exposed).

>and handles the entire forwarding path on its own.

This is default behavior. i.e learning and flooding.
Can you at least retrieve the fdb? example how to figure out which
port a specific MAC address resides?

>It usually
> allows creating VLANs and assigning ports to them (tagged, untagged),

I wasnt sure about the vlans<->port mapping as i stated in the earlier
email. So on this issue, I can see the challenge.
You could of course put vlan netdevs on top of switch ports and then
attach those to the bridge, but i cant see an approach if a switch port
can support more than one vlan without having multiple bridges. example:
bridgeA: link ports {swp0:vlan1, swp1:vlan2, swp0:vlan4}
bridgeB: link ports {swp0:vlan3, swp1:vlan4, swp1:vlan2}


 > but many (probably most) switches do not support controlling the
> forwarding path via a MAC address based FDB.
>

Ok, so operations like fdb_add/del will be disallowed. This is really
up to the driver to not expose such ops.

> Many also do not have support for a packet header to indicate the
> incoming/outgoing switch port, so creating one netdev per port will work
> only for link status, not for the data path.

You mean when such a packet arrives on the "cpu" port, you wont know the
originating port?

cheers,
jamal

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Felix Fietkau Oct. 23, 2013, 1:31 p.m. UTC | #14
On 2013-10-23 2:53 PM, Jamal Hadi Salim wrote:
> On 10/23/13 08:04, Felix Fietkau wrote:
> 
> 
>> A typical switch has something like 5-8 ports (+ one port that goes to
>> the CPU),
> 
> My opinion:
> So exposing the 5-8 ports as netdevs would be useful. Giving access to
> their stats through per-port netdevs etc. i.e a switch/bridge will show
> up on bootup and the 5-8 ports as well. The 5-8 ports will show up
> as bridge ports to the switch.
So you would like to have 'dummy' netdevs that don't actually work like
real ones, just to get stats?

> If something requires other "services" like l3 - I am assuming that
> would show up in the cpu port, but its role is really to demux
> and send it to ingress of the originating port on ASIC (i.e dont
> think it should be exposed).
Many of these switches are designed to work completely standalone, i.e.
they receive their configuration once and then do their thing, often
they don't even have special treatment for the CPU port.

>>and handles the entire forwarding path on its own.
> 
> This is default behavior. i.e learning and flooding.
> Can you at least retrieve the fdb? example how to figure out which
> port a specific MAC address resides?
On some of them, but not all.

>>It usually
>> allows creating VLANs and assigning ports to them (tagged, untagged),
> 
> I wasnt sure about the vlans<->port mapping as i stated in the earlier
> email. So on this issue, I can see the challenge.
> You could of course put vlan netdevs on top of switch ports and then
> attach those to the bridge, but i cant see an approach if a switch port
> can support more than one vlan without having multiple bridges. example:
> bridgeA: link ports {swp0:vlan1, swp1:vlan2, swp0:vlan4}
> bridgeB: link ports {swp0:vlan3, swp1:vlan4, swp1:vlan2}
So even more dummy interfaces that serve no real purpose other than
configuration?

>  > but many (probably most) switches do not support controlling the
>> forwarding path via a MAC address based FDB.
> 
> Ok, so operations like fdb_add/del will be disallowed. This is really
> up to the driver to not expose such ops.
> 
>> Many also do not have support for a packet header to indicate the
>> incoming/outgoing switch port, so creating one netdev per port will work
>> only for link status, not for the data path.
> 
> You mean when such a packet arrives on the "cpu" port, you wont know the
> originating port?
Correct. I still get the impression that the model you're describing is
mostly incompatible with what we're trying to do, and comes at the cost
of quite a bit of extra complexity and bloat, not just on the
implementation side, but on the configuration side as well.
It also seems to make it more difficult to support vendor specific
features. I strongly doubt that the slight increase in consistency
between different kinds of switches/bridges is worth all of these extra
costs.

- Felix
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jamal Hadi Salim Oct. 23, 2013, 2:09 p.m. UTC | #15
On 10/23/13 09:31, Felix Fietkau wrote:
> On 2013-10-23 2:53 PM, Jamal Hadi Salim wrote:

> So you would like to have 'dummy' netdevs that don't actually work like
> real ones, just to get stats?

Not just stats, but other utilities, example:
*operational status read and admin status control,
*MAC address setting?
*MTU setting
* If something shows up on the cpu port and comes up, we can make it 
appear to be from such a netdev (for the case where this applies)
* etc

> Many of these switches are designed to work completely standalone, i.e.
> they receive their configuration once and then do their thing, often
> they don't even have special treatment for the CPU port.
>

So if i understood the worst case scenario:
- no packets will ever come to the CPU
- minimal config only such as configuring ports and what vlans they
accept
- you cant query the device for anything else not even stats

>> Can you at least retrieve the fdb? example how to figure out which
>> port a specific MAC address resides?
> On some of them, but not all.
>

I think this would be a fit for netdev->features to set capabilities at
initialization.
So canSetfdb, canGetfdb, canDelfdb etc


>> can support more than one vlan without having multiple bridges. example:
>> bridgeA: link ports {swp0:vlan1, swp1:vlan2, swp0:vlan4}
>> bridgeB: link ports {swp0:vlan3, swp1:vlan4, swp1:vlan2}
> So even more dummy interfaces that serve no real purpose other than
> configuration?

Yes. It may sound rediculous(trademark for that owned by DaveM), but
given the returns that all other classical linux tools work, I think it
is worth it.
Disclaimer: I still think this part is acrobatic in nature i.e no good
one-to-one mapping

> Correct.

How do you deal with those situations today example when a packet
shows up in the cpu port and they require routing?
Do you have one monolithic switch netdev ?

>I still get the impression that the model you're describing is
> mostly incompatible with what we're trying to do, and comes at the cost
> of quite a bit of extra complexity and bloat, not just on the
> implementation side, but on the configuration side as well.

/Sigh
I understand it is a dilema especially when you have your model proven
already with users.
The danger is one-offs where certain tools only work with certain
instantiations of common features. From a usability perspective,
it would be nice to use iproute2, ifconfig etc on the switch/ports and
not learn another tool (or program the switch to a different API).

> It also seems to make it more difficult to support vendor specific
> features. I strongly doubt that the slight increase in consistency
> between different kinds of switches/bridges is worth all of these extra
> costs.

I am not privy to what specific vendor features exist that are out of
whack. But note:
We have ability to set capabilities (netdev->features is one, but you 
can add another netdev->field). Would it not make sense for the driver
  to set such capabilities and the generic code to turn on/off certain
things? Example turn on netdev->ops->fdb_add if the switch is capable
etc.

cheers,
jamal


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Felix Fietkau Oct. 23, 2013, 2:32 p.m. UTC | #16
On 2013-10-23 4:09 PM, Jamal Hadi Salim wrote:
> On 10/23/13 09:31, Felix Fietkau wrote:
>> On 2013-10-23 2:53 PM, Jamal Hadi Salim wrote:
> 
>> So you would like to have 'dummy' netdevs that don't actually work like
>> real ones, just to get stats?
> 
> Not just stats, but other utilities, example:
> *operational status read and admin status control,

> *MAC address setting?
Typically ignored by switches.

> *MTU setting
Can usually not be controlled per-port. Where supported, it is usually a
global configuration parameter for the switch.

> * If something shows up on the cpu port and comes up, we can make it 
> appear to be from such a netdev (for the case where this applies)
I think that's actually more confusing for users if they find the same
kind of devices on multiple different switches, and on some they can be
used directly, on others they cannot.

> * etc
> 
>> Many of these switches are designed to work completely standalone, i.e.
>> they receive their configuration once and then do their thing, often
>> they don't even have special treatment for the CPU port.
>>
> 
> So if i understood the worst case scenario:
> - no packets will ever come to the CPU
> - minimal config only such as configuring ports and what vlans they
> accept
> - you cant query the device for anything else not even stats
Correct.

>>> Can you at least retrieve the fdb? example how to figure out which
>>> port a specific MAC address resides?
>> On some of them, but not all.
> I think this would be a fit for netdev->features to set capabilities at
> initialization.
> So canSetfdb, canGetfdb, canDelfdb etc

>>> can support more than one vlan without having multiple bridges. example:
>>> bridgeA: link ports {swp0:vlan1, swp1:vlan2, swp0:vlan4}
>>> bridgeB: link ports {swp0:vlan3, swp1:vlan4, swp1:vlan2}
>> So even more dummy interfaces that serve no real purpose other than
>> configuration?
> 
> Yes. It may sound rediculous(trademark for that owned by DaveM), but
> given the returns that all other classical linux tools work, I think it
> is worth it.
The classical Linux tools here only cover the most basic configuration
parts. In many cases, separate configuration options are needed. For
example, on some switches, forwarding table IDs can be assigned to VLANs.
Also, the switch driver is completely independent of the network device
driver that drives the port connected to the CPU port of the switch. The
only ways I can imagine implementing this in the Linux network stack
involve an unhealthy amount of layering violations or other forms of
ugly hackery.

> Disclaimer: I still think this part is acrobatic in nature i.e no good
> one-to-one mapping
> 
>> Correct.
> 
> How do you deal with those situations today example when a packet
> shows up in the cpu port and they require routing?
> Do you have one monolithic switch netdev ?
The switch driver usually attaches itself as a PHY driver, there is no
monolithic switch netdev.

>>I still get the impression that the model you're describing is
>> mostly incompatible with what we're trying to do, and comes at the cost
>> of quite a bit of extra complexity and bloat, not just on the
>> implementation side, but on the configuration side as well.
> 
> /Sigh
> I understand it is a dilema especially when you have your model proven
> already with users.
> The danger is one-offs where certain tools only work with certain
> instantiations of common features. From a usability perspective,
> it would be nice to use iproute2, ifconfig etc on the switch/ports and
> not learn another tool (or program the switch to a different API).
I fully agree that this would be nice to have. I've given quite a bit of
thought to trying to figure out if there's a simple clean way to
implement this, but in all of the proposals I've seen so far, the costs
(complexity, bloat, quirky interfaces) seem to massively outweigh the
benefits.

>> It also seems to make it more difficult to support vendor specific
>> features. I strongly doubt that the slight increase in consistency
>> between different kinds of switches/bridges is worth all of these extra
>> costs.
> 
> I am not privy to what specific vendor features exist that are out of
> whack. But note:
> We have ability to set capabilities (netdev->features is one, but you 
> can add another netdev->field). Would it not make sense for the driver
>   to set such capabilities and the generic code to turn on/off certain
> things? Example turn on netdev->ops->fdb_add if the switch is capable
> etc.
I don't think bloating up the netdev feature flags for lots of
single-vendor fields is a good idea. swconfig simply allows the driver
to register its own global, per-port and per-vlan attributes and user
space can discover them.

That also avoids the nasty issue of userspace code having to know about
all possible vendor specific features and bits of status information.

- Felix
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jamal Hadi Salim Oct. 25, 2013, 11:43 a.m. UTC | #17
Hi Felix,

Sorry for the latency - some distractions on the side.

On 10/23/13 10:32, Felix Fietkau wrote:
> On 2013-10-23 4:09 PM, Jamal Hadi Salim wrote:

>> *MAC address setting?
> Typically ignored by switches.
>

Ok, I take it the minority allow you to do this.
For most, the switch port has some factory shipped MAC address?

>> *MTU setting
> Can usually not be controlled per-port. Where supported, it is usually a
> global configuration parameter for the switch.

Does that mean one mtu for all switch ports on such devices?


>> * If something shows up on the cpu port and comes up, we can make it
>> appear to be from such a netdev (for the case where this applies)
> I think that's actually more confusing for users if they find the same
> kind of devices on multiple different switches, and on some they can be
> used directly, on others they cannot.
>

But how does it work today for the case where you have one chip that
wont pass up the tag to the cpu and another that does? i.e what
happens to packets that end up being shunted to CPU?

> The classical Linux tools here only cover the most basic configuration
> parts. In many cases, separate configuration options are needed. For
> example, on some switches, forwarding table IDs can be assigned to VLANs.

Multiple forwarding tables?

> Also, the switch driver is completely independent of the network device
> driver that drives the port connected to the CPU port of the switch.

I guess this is because one piece manages attributes and other is
for packet processing?
There is good precedence in a few embedded systems which are
equally challenged but still expose ports as netdevs.

>The
> only ways I can imagine implementing this in the Linux network stack
> involve an unhealthy amount of layering violations or other forms of
> ugly hackery.
>


> The switch driver usually attaches itself as a PHY driver, there is no
> monolithic switch netdev.
>

Shouldnt the PHY driver be owned by some netdev?

> I fully agree that this would be nice to have. I've given quite a bit of
> thought to trying to figure out if there's a simple clean way to
> implement this, but in all of the proposals I've seen so far, the costs
> (complexity, bloat, quirky interfaces) seem to massively outweigh the
> benefits.
>

I can understand the massive differences in capabilities make this
harder to retrofit. But if the only cause for impendance mismatch
is these capability differences, I think it can be resolved.
We need a way to discover them and only use those available.

> I don't think bloating up the netdev feature flags for lots of
> single-vendor fields is a good idea.

I agree if you say there is a variety of capabilities.
But if this is to be resolved - there has to be a way for these
capabilities to be advertised by low level (and netdev->features
is our only vehicle at the moment). We could have switch features
in addition etc etc.

> swconfig simply allows the driver
> to register its own global, per-port and per-vlan attributes and user
> space can discover them.
>
> That also avoids the nasty issue of userspace code having to know about
> all possible vendor specific features and bits of status information.
>

So it seems to me you already have taken care of this piece.
Why not pull that into the netdev or bridge core and then re-use it?

cheers,
jamal
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Felix Fietkau Oct. 25, 2013, 1:01 p.m. UTC | #18
On 2013-10-25 1:43 PM, Jamal Hadi Salim wrote:
> Hi Felix,
> 
> Sorry for the latency - some distractions on the side.
> 
> On 10/23/13 10:32, Felix Fietkau wrote:
>> On 2013-10-23 4:09 PM, Jamal Hadi Salim wrote:
> 
>>> *MAC address setting?
>> Typically ignored by switches.
>>
> 
> Ok, I take it the minority allow you to do this.
> For most, the switch port has some factory shipped MAC address?
I think it's common for the switch to have a global MAC address, not a
per-port one.

>>> *MTU setting
>> Can usually not be controlled per-port. Where supported, it is usually a
>> global configuration parameter for the switch.
> 
> Does that mean one mtu for all switch ports on such devices?
Correct.

>>> * If something shows up on the cpu port and comes up, we can make it
>>> appear to be from such a netdev (for the case where this applies)
>> I think that's actually more confusing for users if they find the same
>> kind of devices on multiple different switches, and on some they can be
>> used directly, on others they cannot.
> 
> But how does it work today for the case where you have one chip that
> wont pass up the tag to the cpu and another that does? i.e what
> happens to packets that end up being shunted to CPU?
'won't pass up the tag'? The switch is treated in pretty much the same
way as a normal managed standalone switch (you know, one you can buy in
a shop and plug your Ethernet cable into).
You simply tell it, which VLANs to put on which ports, and make the
ports tagged or untagged.
The link between the switch and the CPU is not really special, for the
switch it's just another port. This way of configuring works with pretty
much all switches that we're using.

>> The classical Linux tools here only cover the most basic configuration
>> parts. In many cases, separate configuration options are needed. For
>> example, on some switches, forwarding table IDs can be assigned to VLANs.
> 
> Multiple forwarding tables?
Yes, some switches have them, and they can be useful when dealing with
multiple VLANs.

>> Also, the switch driver is completely independent of the network device
>> driver that drives the port connected to the CPU port of the switch.
> 
> I guess this is because one piece manages attributes and other is
> for packet processing?
> There is good precedence in a few embedded systems which are
> equally challenged but still expose ports as netdevs.
No, because the connection between the CPU and the switch is handled by
a normal Ethernet MAC. The Ethernet chip doesn't care if there's a
switch connected to it, or a regular PHY.
It's just a normal MII connection, nothing more.

>>The
>> only ways I can imagine implementing this in the Linux network stack
>> involve an unhealthy amount of layering violations or other forms of
>> ugly hackery.
>> The switch driver usually attaches itself as a PHY driver, there is no
>> monolithic switch netdev.
> 
> Shouldnt the PHY driver be owned by some netdev?
Right, the netdev that owns the PHY is a normal Ethernet MAC, running
any normal Linux Ethernet driver.

>> I fully agree that this would be nice to have. I've given quite a bit of
>> thought to trying to figure out if there's a simple clean way to
>> implement this, but in all of the proposals I've seen so far, the costs
>> (complexity, bloat, quirky interfaces) seem to massively outweigh the
>> benefits.
> I can understand the massive differences in capabilities make this
> harder to retrofit. But if the only cause for impendance mismatch
> is these capability differences, I think it can be resolved.
> We need a way to discover them and only use those available.
I remain absolutely unconvinced that this will make the end result
better. Right now, these switches act like separate devices, because
aside from the fact that they're put on the same board with other
components, they pretty much *are* separate devices.

You seem to insist on treating it as a kind of port multiplexer + bridge
accelerator instead of a mostly standalone switch.

This may work for some devices, but on others this simply a model that
the hardware wasn't designed for. Sure, we could try to cram in all
those special cases, extra options, and hack through the layers where
they're in the way. If *all* you care about is being able to reuse the
existing interfaces, that might even seem like a good idea.

On the other hand, I've pointed out quite a few examples where the model
of trying to cram it into the bridge API is just a bad fit in general.

>> I don't think bloating up the netdev feature flags for lots of
>> single-vendor fields is a good idea.
> 
> I agree if you say there is a variety of capabilities.
> But if this is to be resolved - there has to be a way for these
> capabilities to be advertised by low level (and netdev->features
> is our only vehicle at the moment). We could have switch features
> in addition etc etc.
Aside from the fact that the swconfig code is already there, the model
that it uses is inherently simple. I worry about all the extra
complexity that we will have to add to try to retrofit this into a
mostly incompatible configuration model.

>> swconfig simply allows the driver
>> to register its own global, per-port and per-vlan attributes and user
>> space can discover them.
>>
>> That also avoids the nasty issue of userspace code having to know about
>> all possible vendor specific features and bits of status information.
> So it seems to me you already have taken care of this piece.
> Why not pull that into the netdev or bridge core and then re-use it?
Because it just doesn't fit there very well.

- Felix
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jamal Hadi Salim Oct. 27, 2013, 5:19 p.m. UTC | #19
On 10/25/13 09:01, Felix Fietkau wrote:
> On 2013-10-25 1:43 PM, Jamal Hadi Salim wrote:

> I think it's common for the switch to have a global MAC address, not a
> per-port one.

Ok, I see. Real cheep.

> 'won't pass up the tag'? The switch is treated in pretty much the same
> way as a normal managed standalone switch (you know, one you can buy in
> a shop and plug your Ethernet cable into).
> You simply tell it, which VLANs to put on which ports, and make the
> ports tagged or untagged.
> The link between the switch and the CPU is not really special, for the
> switch it's just another port. This way of configuring works with pretty
> much all switches that we're using.

So does it get its own MAC address? Other than flooding broadcasts,
how does one end up sending packets to the cpu?

> Yes, some switches have them, and they can be useful when dealing with
> multiple VLANs.

Very nice. So we go from one extreme of cheep to sophisticated ;->
I think the only way you can achieve multiple tables on the bridge
is by creating multiple bridges.

> No, because the connection between the CPU and the switch is handled by
> a normal Ethernet MAC. The Ethernet chip doesn't care if there's a
> switch connected to it, or a regular PHY.
> It's just a normal MII connection, nothing more.
>
[..]
 >
 > Right, the netdev that owns the PHY is a normal Ethernet MAC, running
 > any normal Linux Ethernet driver.
 >
[..]
> I remain absolutely unconvinced that this will make the end result
> better. Right now, these switches act like separate devices, because
> aside from the fact that they're put on the same board with other
> components, they pretty much *are* separate devices.
>
> You seem to insist on treating it as a kind of port multiplexer + bridge
> accelerator instead of a mostly standalone switch.
>


Yes, the above is the point i was making.
I apologize for sounding like a broken record, but to just re-iterate:
there are, if i recall correctly, several drivers  in the kernel
which are challenged as such (with single entry point into the CPU)
which expose multiple netdevs with the driver acting as mux point.

> This may work for some devices, but on others this simply a model that
> the hardware wasn't designed for.

I agree. But what i just described above is not new. A lot of embedded
multiport NICs tend to be handicapped in exactly the same way.

> Sure, we could try to cram in all
> those special cases, extra options, and hack through the layers where
> they're in the way. If *all* you care about is being able to reuse the
> existing interfaces, that might even seem like a good idea.
>

I do care a lot about using existing interfaces ;-> Great usability
for someone to run a tool that has been around for 20 years and it
works. If i can just reuse my scripts without having to invent
new ones etc etc.

> On the other hand, I've pointed out quite a few examples where the model
> of trying to cram it into the bridge API is just a bad fit in general.
>

Sorry Felix, nothing you described is insurmountable.
The challenge here is non-technical:
You already have code that has been proven and is deployed for what 
appears to be sometime now.
I totally empathize.

cheers,
jamal

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Florian Fainelli Oct. 27, 2013, 6:14 p.m. UTC | #20
2013/10/27 Jamal Hadi Salim <jhs@mojatatu.com>:
> On 10/25/13 09:01, Felix Fietkau wrote:
>>
>> On 2013-10-25 1:43 PM, Jamal Hadi Salim wrote:
>
>
>> I think it's common for the switch to have a global MAC address, not a
>> per-port one.
>
>
> Ok, I see. Real cheep.

They are yes, the only "fancy" features these switches allow is
basically to set a given's port vlan id, which is already a huge
improvement compared to the vendor provided firmware.

>
>
>> 'won't pass up the tag'? The switch is treated in pretty much the same
>> way as a normal managed standalone switch (you know, one you can buy in
>> a shop and plug your Ethernet cable into).
>> You simply tell it, which VLANs to put on which ports, and make the
>> ports tagged or untagged.
>> The link between the switch and the CPU is not really special, for the
>> switch it's just another port. This way of configuring works with pretty
>> much all switches that we're using.
>
>
> So does it get its own MAC address? Other than flooding broadcasts,
> how does one end up sending packets to the cpu?

The switch does have an address learning process which is usually not
controlled by software at all, so yes, flooding is usually the way to
get it to the CPU.

>
>
>> Yes, some switches have them, and they can be useful when dealing with
>> multiple VLANs.
>
>
> Very nice. So we go from one extreme of cheep to sophisticated ;->
> I think the only way you can achieve multiple tables on the bridge
> is by creating multiple bridges.
>
>
>> No, because the connection between the CPU and the switch is handled by
>> a normal Ethernet MAC. The Ethernet chip doesn't care if there's a
>> switch connected to it, or a regular PHY.
>> It's just a normal MII connection, nothing more.
>>
> [..]
>
>>
>> Right, the netdev that owns the PHY is a normal Ethernet MAC, running
>> any normal Linux Ethernet driver.
>>
> [..]
>
>> I remain absolutely unconvinced that this will make the end result
>> better. Right now, these switches act like separate devices, because
>> aside from the fact that they're put on the same board with other
>> components, they pretty much *are* separate devices.
>>
>> You seem to insist on treating it as a kind of port multiplexer + bridge
>> accelerator instead of a mostly standalone switch.
>>
>
>
> Yes, the above is the point i was making.
> I apologize for sounding like a broken record, but to just re-iterate:
> there are, if i recall correctly, several drivers  in the kernel
> which are challenged as such (with single entry point into the CPU)
> which expose multiple netdevs with the driver acting as mux point.

Which exact drivers are you refering to? If we are talking about DSA
then yes, this is correct, but it is completely Ethernet MAC driver
agnostic.

>
>
>> This may work for some devices, but on others this simply a model that
>> the hardware wasn't designed for.
>
>
> I agree. But what i just described above is not new. A lot of embedded
> multiport NICs tend to be handicapped in exactly the same way.

Why would we expose the hardware switch physical ports as netdevs if
we cannot even any control over their data-path? Unlike these
multiport NICs, the only traffic you see and you can control is the
one from your CPU port.

>
>
>> Sure, we could try to cram in all
>> those special cases, extra options, and hack through the layers where
>> they're in the way. If *all* you care about is being able to reuse the
>> existing interfaces, that might even seem like a good idea.
>>
>
> I do care a lot about using existing interfaces ;-> Great usability
> for someone to run a tool that has been around for 20 years and it
> works. If i can just reuse my scripts without having to invent
> new ones etc etc.

I do not really see how we could bend the existing interface (is it
rtnetlink we are talking about or something else btw?) to expose these
switches, maybe we could with iproute2, but still, the user-space
interface/tool is far from being the problem here.


>
>
>> On the other hand, I've pointed out quite a few examples where the model
>> of trying to cram it into the bridge API is just a bad fit in general.
>>
>
> Sorry Felix, nothing you described is insurmountable.
> The challenge here is non-technical:
> You already have code that has been proven and is deployed for what appears
> to be sometime now.
> I totally empathize.

I don't think at any point in this discussion there was a mention that
we do not want to change the user or kernel interface in OpenWrt
because we have been using this for the past 5 years, on the contrary,
if we are bringing this to a wide audience, this is to get some proper
review and eventually change it.
Felix Fietkau Oct. 27, 2013, 7:51 p.m. UTC | #21
On 2013-10-27 6:19 PM, Jamal Hadi Salim wrote:
> On 10/25/13 09:01, Felix Fietkau wrote:
>> 'won't pass up the tag'? The switch is treated in pretty much the same
>> way as a normal managed standalone switch (you know, one you can buy in
>> a shop and plug your Ethernet cable into).
>> You simply tell it, which VLANs to put on which ports, and make the
>> ports tagged or untagged.
>> The link between the switch and the CPU is not really special, for the
>> switch it's just another port. This way of configuring works with pretty
>> much all switches that we're using.
> 
> So does it get its own MAC address? Other than flooding broadcasts,
> how does one end up sending packets to the cpu?
That question does not make any sense to me. Aside from low level
control frames like pause frames for flow control, the switch has no
need to send packets to the CPU port on its own.
Remember what I told you about the switch being a *separate* entity from
the NIC that connects it to the CPU.

>> I remain absolutely unconvinced that this will make the end result
>> better. Right now, these switches act like separate devices, because
>> aside from the fact that they're put on the same board with other
>> components, they pretty much *are* separate devices.
>>
>> You seem to insist on treating it as a kind of port multiplexer + bridge
>> accelerator instead of a mostly standalone switch.
> 
> Yes, the above is the point i was making.
> I apologize for sounding like a broken record, but to just re-iterate:
> there are, if i recall correctly, several drivers  in the kernel
> which are challenged as such (with single entry point into the CPU)
> which expose multiple netdevs with the driver acting as mux point.
DSA does this, and last time I looked, it pushes *all* bridge traffic
through the CPU, making it completely unusable for slower embedded CPUs.

If I remember correctly, adding support 'bridge acceleration' was left
as an exercise for the reader and never actually implemented.

Sure, this could be fixed somehow, but even then the model and
assumptions that DSA is built on simply don't work for some of the
dumber switches that we support.

>> This may work for some devices, but on others this simply a model that
>> the hardware wasn't designed for.
> 
> I agree. But what i just described above is not new. A lot of embedded
> multiport NICs tend to be handicapped in exactly the same way.
> 
>> Sure, we could try to cram in all
>> those special cases, extra options, and hack through the layers where
>> they're in the way. If *all* you care about is being able to reuse the
>> existing interfaces, that might even seem like a good idea.
> 
> I do care a lot about using existing interfaces ;-> Great usability
> for someone to run a tool that has been around for 20 years and it
> works. If i can just reuse my scripts without having to invent
> new ones etc etc.
I see that. But please stop treating this as the *only* factor that
matters! I'd like to see a more balanced cost/benefit analysis.

>> On the other hand, I've pointed out quite a few examples where the model
>> of trying to cram it into the bridge API is just a bad fit in general.
> 
> Sorry Felix, nothing you described is insurmountable.
I'm not saying it's insurmountable, I'm saying it's impractical!
It makes one aspect (code reuse) better in some cases, while making lots
of other aspects worse.

> The challenge here is non-technical:
> You already have code that has been proven and is deployed for what 
> appears to be sometime now.
> I totally empathize.
Please stop making it look like this is the primary issue. Sure, it's
more convenient for us to reuse the existing code, but it's far from
being the only important factor here!
As an embedded Linux developer, I care a lot about fighting complexity
and bloat, and those do tend to be much harder to deal with than a bit
of API consistency.

I get the sense that trying to communicate on an abstract level gets us
nowhere in this discussion, so let me make it a bit more specific with
some examples:

One of the currently very common switches in many embedded devices is
the RTL8366/RTL8367. It has some flexibility when it comes to
configuring VLANs, and it's one of the few ones where you can configure
a forwarding table for a VLAN (which spans multiple ports), which allows
software bridging between multiple VLANs.
However, what this switch does *not* support is adding a header/trailer
to packets to indicate the originating port.
This means that all per-port netdevs will be dummy ports which don't
include the data path.

So let's say you have a configuration where you're using VLAN ID 4 on
port 1, and you want to bridge it to VLAN ID 400 on port 2.

Sounds easy enough, you can easily create a bridge that spans port1.4
and port2.400. Except, this particular switch (like pretty much any
other switch supported by swconfig) isn't actually able to handle such a
configuration on its own.
It needs two VLAN configurations, with different forwarding table IDs,
and then the software bridge on the CPU port needs to forward between
the two different VLANs.
To be able to handle such a configuration, the code would have to detect
this kind of special case scenario, somehow hook itself via rx handler
into the NIC connected to the CPU port and emulate that VLAN ID
replacement behavior.

With swconfig, you create two VLANs: VLAN 4, containing CPU and port1;
VLAN 400, containing CPU and port2. You then create a software bridge
between eth0.4 and eth0.400 (assuming eth0 is the NIC connected to the
switch).

In a different scenario, the code would also have to detect
configurations that the switch isn't able to handle, e.g.: bridging
port1.4 to eth1 and port2.4 to eth2.
Such a configuration wouldn't work at all with such a switch, because
the CPU isn't able to tell apart traffic from port1 and port2, and
there's no way to tell the switch that port1.4 and port2.4 should not be
connected to each other, but both should go to the CPU.

Those are just two simple scenarios from the top of my head - I'm pretty
sure I could come up with a long list of further corner cases and
quirks, which are simply either difficult to deal with, or completely
unnatural in the model that you're describing.

Trying to make all of these cases work in the code will make the whole
thing a lot more difficult to deal with and maintain. It will also make
it much harder for the user to figure out, what configurations work, and
what configurations don't.

Especially the case with reusing VLANs on different ports (but not
connecting them to each other) is something that can easily work with
software devices, but cannot be emulated on most embedded device
switches. The software bridge configuration model raises a lot of
expectations that these switches simply cannot meet.

If you look at the swconfig model, you will see that the abstraction
clearly communicates the limitations of these typical switches.

The configuration model simply doesn't even let you express these kinds
of unsuppported configurations that seem normal in the tools used to set
up software bridges/vlans.
At the same time, it's fairly consistent across the range of different
chips that we have drivers for. That certainly leaves a much smaller
amount of traps and surprises for users, compared to trying to emulate
the software bridge model by hacking through the layers.

Hopefully this will clear a few things up for you.

- Felix
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jamal Hadi Salim Oct. 28, 2013, 10:29 p.m. UTC | #22
On 10/27/13 14:14, Florian Fainelli wrote:
> 2013/10/27 Jamal Hadi Salim <jhs@mojatatu.com>:
>> On 10/25/13 09:01, Felix Fietkau wrote:
>>>

>
> They are yes, the only "fancy" features these switches allow is
> basically to set a given's port vlan id, which is already a huge
> improvement compared to the vendor provided firmware.
>

Nice to know that you have something better than the vendor provided
stuff.

>
> The switch does have an address learning process which is usually not
> controlled by software at all, so yes, flooding is usually the way to
> get it to the CPU.
>

Ok.

> Which exact drivers are you refering to? If we are talking about DSA
> then yes, this is correct, but it is completely Ethernet MAC driver
> agnostic.
>

Sorry - cant point you to an exact one; one that i tried to convert to
NAPI and found these issues was from Netlogic (embedded 64 bit mips),
that i think now is in the kernel proper (and someone had converted to
NAPI as well). Let me get back to you with some sample examples..

> Why would we expose the hardware switch physical ports as netdevs if
> we cannot even any control over their data-path? Unlike these
> multiport NICs, the only traffic you see and you can control is the
> one from your CPU port.
>

Not necessarily for datapath, rather for control path. If i can
pull the stats, ifconfig up/down the port, set flow control
etc - then that is a  good reason to expose them.

>
> I do not really see how we could bend the existing interface (is it
> rtnetlink we are talking about or something else btw?) to expose these
> switches, maybe we could with iproute2, but still, the user-space
> interface/tool is far from being the problem here.
>

Look at the FDB API.
The user space interface as well as reusing kernel interfaces is my main
arguement.

> I don't think at any point in this discussion there was a mention that
> we do not want to change the user or kernel interface in OpenWrt
> because we have been using this for the past 5 years, on the contrary,
> if we are bringing this to a wide audience, this is to get some proper
> review and eventually change it.
>

Ok, sorry - I misinterpreted you and Felix. Like i said, if you gave me
that reason I would understand.

cheers,
jamal




--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jamal Hadi Salim Oct. 28, 2013, 10:53 p.m. UTC | #23
On 10/27/13 15:51, Felix Fietkau wrote:
> On 2013-10-27 6:19 PM, Jamal Hadi Salim wrote:

> That question does not make any sense to me. Aside from low level
> control frames like pause frames for flow control, the switch has no
> need to send packets to the CPU port on its own.
> Remember what I told you about the switch being a *separate* entity from
> the NIC that connects it to the CPU.
>

I am assuming there is a MAC address which is identified to be that of a
switch. Something responds to ARP for example for that MAC. I think
you are  saying that for a certain class of switch chips, there is no
concept of "cpu port" - therefore there cannot be unicast from the
chip to the cpu.

> DSA does this, and last time I looked, it pushes *all* bridge traffic
> through the CPU, making it completely unusable for slower embedded CPUs.
>

I wasnt thinking DSA (rather some MIPS based embedded boards)- but now
that you bring it up, lets Cc Lennert.

> If I remember correctly, adding support 'bridge acceleration' was left
> as an exercise for the reader and never actually implemented.
>

 From talking to you, I realize there are things that are dumb and
cant be "accelerated". The scenarios so far have been for accelaration
(or to be correct: offloading).
And my contention is - this is a matter of capability discovery as
advertised by the driver and as used by the user tools.

> Sure, this could be fixed somehow, but even then the model and
> assumptions that DSA is built on simply don't work for some of the
> dumber switches that we support.
>

Agreed.

[.. content removed for brevity, dont think we have disagreements ..]


> One of the currently very common switches in many embedded devices is
> the RTL8366/RTL8367. It has some flexibility when it comes to
> configuring VLANs, and it's one of the few ones where you can configure
> a forwarding table for a VLAN (which spans multiple ports), which allows
> software bridging between multiple VLANs.
> However, what this switch does *not* support is adding a header/trailer
> to packets to indicate the originating port.
> This means that all per-port netdevs will be dummy ports which don't
> include the data path.
>

My view is that netdevs are still valuable even if only they get used 
for control path. Like you said earlier - you can still pull stats, flow 
control messages still make it through etc. They provide you
the consistent api to configure the switch above, ex:
If i was to use the FDB api for this switch as long as i can
abstract it in software as a bridge, I could send it a switch config
via its ops which says:
"I am giving you this entry with vland 400 for port 2, but i want you to
send it to the hardware not to your local entry"

> So let's say you have a configuration where you're using VLAN ID 4 on
> port 1, and you want to bridge it to VLAN ID 400 on port 2.
>
> Sounds easy enough, you can easily create a bridge that spans port1.4
> and port2.400. Except, this particular switch (like pretty much any
> other switch supported by swconfig) isn't actually able to handle such a
> configuration on its own.

Makes sense.
Let me point that even the Linux bridge cant handle this on its own
either.
You would need two bridges instantiated. The "cpu port" (we should call
it the "L3 port" really) is implicit in the case of the bridge i.e it
is the Linux network stack.
You would need to set the vlan filters on the bridge to strip the vlan
on egress of the first bridge etc ..

> It needs two VLAN configurations, with different forwarding table IDs,
> and then the software bridge on the CPU port needs to forward between
> the two different VLANs.
> To be able to handle such a configuration, the code would have to detect
> this kind of special case scenario, somehow hook itself via rx handler
> into the NIC connected to the CPU port and emulate that VLAN ID
> replacement behavior.
>


IMO: You dont need to muck with rx handler if you used bridge
abstraction. It becomes a config issue.

> With swconfig, you create two VLANs: VLAN 4, containing CPU and port1;
> VLAN 400, containing CPU and port2. You then create a software bridge
> between eth0.4 and eth0.400 (assuming eth0 is the NIC connected to the
> switch).
>

Can we call that "L3" instead of software bridge?
It can be done if you create a Linux bridge in software per L2 table id
in your chip. Then you attach the bridge ports.
A linux bridge of the sort, assuming there's a subnet per bridge is
configured thus:
bridge-tab1: link ports {eth0:vlan4, eth1:vlan4}, subnet 1
bridge-tab2: link ports {eth0:vlan400, eth1:vlan400}, subnet 2


> In a different scenario, the code would also have to detect
> configurations that the switch isn't able to handle, e.g.: bridging
> port1.4 to eth1 and port2.4 to eth2.
> Such a configuration wouldn't work at all with such a switch, because
> the CPU isn't able to tell apart traffic from port1 and port2, and
> there's no way to tell the switch that port1.4 and port2.4 should not be
> connected to each other, but both should go to the CPU.
>

Understood.
I think that discovery is a must - so you can apply different behavior
to different switches.
But you seem to have solved this already. Linux as is does not.
You can either have the driver tell you what it can/cant do or you
can attempt to fire and miss and get a return code that will tell
you that it cant achieve what you are asking it to do. I prefer the
former.

 >
> Those are just two simple scenarios from the top of my head - I'm pretty
> sure I could come up with a long list of further corner cases and
> quirks, which are simply either difficult to deal with, or completely
> unnatural in the model that you're describing.
>

I think these are the kind of things that need to be enumerated to come
to some conclusion.

> Trying to make all of these cases work in the code will make the whole
> thing a lot more difficult to deal with and maintain. It will also make
> it much harder for the user to figure out, what configurations work, and
> what configurations don't.
>
>
> Especially the case with reusing VLANs on different ports (but not
> connecting them to each other) is something that can easily work with
> software devices, but cannot be emulated on most embedded device
> switches. The software bridge configuration model raises a lot of
> expectations that these switches simply cannot meet.
>

I wouldnt expect every thing a software bridge does would be met by
a random switch.S/w bridge would be the super-set. But this is
not a new concept, example: Netdev itself is an abstraction - we have
USB, ethernet, wireless, variety of virtual interfaces etc.
Sometimes we dont even have the concept of a "link" in some of these
devices; infiniband would have a huge MAC address but i can still
use ifconfig on it etc.

> If you look at the swconfig model, you will see that the abstraction
> clearly communicates the limitations of these typical switches.
>

I will have to go back and look - but like i said earlier seems to me
you have solved this problem. Of the switch hardware i am familiar with
(high end pricey stuff), the capabilities tend to fall into the
following components:
-flooding control (i.e what should happen on destination failure)
-learning control (i.e what should happen on the source lookup failure)
(Ive seen knobs for "drop", "send to portX" where "X" could be cpu etc)
-fdb capacity
-whether it can do vlans, filtering pvids etc
-multicast snooping capability

To add to the above a few more based on talking to you:
- cpu port (in what ive come across this is always present, but
as you point out this cannot be assumed)
- ingress port tag (you point out that some cases this may never be
present even when the cpu port is present)
- ive never seen table id, but i think this is another one; in which
case the number of table ids becomes something one needs to discover..

cheers,
jamal

> The configuration model simply doesn't even let you express these kinds
> of unsuppported configurations that seem normal in the tools used to set
> up software bridges/vlans.
> At the same time, it's fairly consistent across the range of different
> chips that we have drivers for. That certainly leaves a much smaller
> amount of traps and surprises for users, compared to trying to emulate
> the software bridge model by hacking through the layers.
>
> Hopefully this will clear a few things up for you.
>
> - Felix
>

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Felix Fietkau Oct. 29, 2013, 9:34 a.m. UTC | #24
On 2013-10-28 23:53, Jamal Hadi Salim wrote:
> On 10/27/13 15:51, Felix Fietkau wrote:
>> On 2013-10-27 6:19 PM, Jamal Hadi Salim wrote:
> 
>> That question does not make any sense to me. Aside from low level
>> control frames like pause frames for flow control, the switch has no
>> need to send packets to the CPU port on its own.
>> Remember what I told you about the switch being a *separate* entity from
>> the NIC that connects it to the CPU.
>>
> 
> I am assuming there is a MAC address which is identified to be that of a
> switch. Something responds to ARP for example for that MAC. I think
> you are  saying that for a certain class of switch chips, there is no
> concept of "cpu port" - therefore there cannot be unicast from the
> chip to the cpu.
These are simple switches, why would they respond to ARP?
I suspect that you're attributing too much functionality to the switch
itself. Think of it as a device similar to the cheap unmanaged ones you
can buy in a shop and hook up to your machine via Ethernet.
Add to that some very limited VLAN grouping functionality, and you're
pretty close to the limits of what these switches can do.
They don't do ARP, IP or other things. They learn about MAC addresses
from incoming packets to build their forwarding path.
The CPU port in this case is whatever port on the switch that you plug
the cable of your machine into :)

>> One of the currently very common switches in many embedded devices is
>> the RTL8366/RTL8367. It has some flexibility when it comes to
>> configuring VLANs, and it's one of the few ones where you can configure
>> a forwarding table for a VLAN (which spans multiple ports), which allows
>> software bridging between multiple VLANs.
>> However, what this switch does *not* support is adding a header/trailer
>> to packets to indicate the originating port.
>> This means that all per-port netdevs will be dummy ports which don't
>> include the data path.
> 
> My view is that netdevs are still valuable even if only they get used 
> for control path. Like you said earlier - you can still pull stats, flow 
> control messages still make it through etc. They provide you
> the consistent api to configure the switch above, ex:
> If i was to use the FDB api for this switch as long as i can
> abstract it in software as a bridge, I could send it a switch config
> via its ops which says:
> "I am giving you this entry with vland 400 for port 2, but i want you to
> send it to the hardware not to your local entry"
The FDB related abstraction that you're describing will not work with
the hardware that I'm talking about. Let's leave that one out of this
discussion.
As for per-port netdevs: Yes, you could pull stats.
No, flow control messages would not make it through.
No idea how it would provide a *consistent* API.
Either way, if adding netdevs just for stats and link state, that could
be easily added on top of swconfig (or whatever name we pick for it)
later. I just don't think it's worth it at this point.

>> So let's say you have a configuration where you're using VLAN ID 4 on
>> port 1, and you want to bridge it to VLAN ID 400 on port 2.
>>
>> Sounds easy enough, you can easily create a bridge that spans port1.4
>> and port2.400. Except, this particular switch (like pretty much any
>> other switch supported by swconfig) isn't actually able to handle such a
>> configuration on its own.
> 
> Makes sense.
> Let me point that even the Linux bridge cant handle this on its own
> either.
> You would need two bridges instantiated. The "cpu port" (we should call
> it the "L3 port" really) is implicit in the case of the bridge i.e it
> is the Linux network stack.
> You would need to set the vlan filters on the bridge to strip the vlan
> on egress of the first bridge etc ..
> 
>> It needs two VLAN configurations, with different forwarding table IDs,
>> and then the software bridge on the CPU port needs to forward between
>> the two different VLANs.
>> To be able to handle such a configuration, the code would have to detect
>> this kind of special case scenario, somehow hook itself via rx handler
>> into the NIC connected to the CPU port and emulate that VLAN ID
>> replacement behavior.
> 
> IMO: You dont need to muck with rx handler if you used bridge
> abstraction. It becomes a config issue.
If we don't need to muck with an rx handler, how are packets intercepted
from the NIC that connects to the switch?
That NIC is run by a driver that knows nothing about switch stuff.

>> With swconfig, you create two VLANs: VLAN 4, containing CPU and port1;
>> VLAN 400, containing CPU and port2. You then create a software bridge
>> between eth0.4 and eth0.400 (assuming eth0 is the NIC connected to the
>> switch).
> 
> Can we call that "L3" instead of software bridge?
L3? Why?

> Understood.
> I think that discovery is a must - so you can apply different behavior
> to different switches.
> But you seem to have solved this already. Linux as is does not.
> You can either have the driver tell you what it can/cant do or you
> can attempt to fire and miss and get a return code that will tell
> you that it cant achieve what you are asking it to do. I prefer the
> former.
I think that's way more confusing to users than presenting a consistent
model that properly reflects what you can do with the hardware.

But I sense a pattern here. I've long had my beef with quite a few Linux
network related APIs for being inconsistent, having no decent error
reporting when you're trying to configure things (errno doesn't count,
it's just too ambiguous), and just making it hard to figure out the
capabilities. Of course, none of this can be easily fixed due to ABI
stability constraints.
I do NOT wish to follow that pattern!

>> Those are just two simple scenarios from the top of my head - I'm pretty
>> sure I could come up with a long list of further corner cases and
>> quirks, which are simply either difficult to deal with, or completely
>> unnatural in the model that you're describing.
> I think these are the kind of things that need to be enumerated to come
> to some conclusion.
I'm not going to try to enumerate all the case; I have other projects
that I need to work on. :)

>> Trying to make all of these cases work in the code will make the whole
>> thing a lot more difficult to deal with and maintain. It will also make
>> it much harder for the user to figure out, what configurations work, and
>> what configurations don't.
>>
>>
>> Especially the case with reusing VLANs on different ports (but not
>> connecting them to each other) is something that can easily work with
>> software devices, but cannot be emulated on most embedded device
>> switches. The software bridge configuration model raises a lot of
>> expectations that these switches simply cannot meet.
> I wouldnt expect every thing a software bridge does would be met by
> a random switch.S/w bridge would be the super-set. But this is
> not a new concept, example: Netdev itself is an abstraction - we have
> USB, ethernet, wireless, variety of virtual interfaces etc.
> Sometimes we dont even have the concept of a "link" in some of these
> devices; infiniband would have a huge MAC address but i can still
> use ifconfig on it etc.
Only a *tiny* part of the software bridge configuration model can be
emulated, the rest does not fit and has to be handled through extensions
or different APIs anyway. That's why I am convinced that it's a really
bad model to try to make these switches fit into it.

You gain a tiny advantage with writing scripts, but at the same time,
the code gets more complex, the configuration interface gets more
confusing, there are more nasty corner cases to take care of.
Why do you insist on making so many things worse just for one tiny
advantage? Where's the pragmatic cost/benefit tradeoff?

>> If you look at the swconfig model, you will see that the abstraction
>> clearly communicates the limitations of these typical switches.
>>
> 
> I will have to go back and look - but like i said earlier seems to me
> you have solved this problem. Of the switch hardware i am familiar with
> (high end pricey stuff), the capabilities tend to fall into the
> following components:
> -flooding control (i.e what should happen on destination failure)
> -learning control (i.e what should happen on the source lookup failure)
> (Ive seen knobs for "drop", "send to portX" where "X" could be cpu etc)
> -fdb capacity
> -whether it can do vlans, filtering pvids etc
> -multicast snooping capability
Right, with most of the switches that we support, almost none of these
things work in a way that can be integrated with the network stack.

> To add to the above a few more based on talking to you:
> - cpu port (in what ive come across this is always present, but
> as you point out this cannot be assumed)
I'm not even sure what you mean when you say 'cpu port cannot be
assumed'. On pretty much all devices that we work with, one of the ports
connects to a NIC in the CPU. It's just that the switch cannot be
assumed to have special treatment for that CPU port. As far as it is
concerned, it is just another port like the others.

> - ingress port tag (you point out that some cases this may never be
> present even when the cpu port is present)
> - ive never seen table id, but i think this is another one; in which
> case the number of table ids becomes something one needs to discover..
Yes, and this is something that doesn't even map directly to something
in the software bridge world.

- Felix
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Maxime Bizon Oct. 29, 2013, 11:12 p.m. UTC | #25
On Wed, 2013-10-23 at 08:53 -0400, Jamal Hadi Salim wrote:

> So exposing the 5-8 ports as netdevs would be useful. Giving access to
> their stats through per-port netdevs etc. i.e a switch/bridge will 

While the intent is to make it look familiar to users, IMO this breaks
the rule of the least surprise.

From a user POV, when you see a netdevice, you expect to be able to
receive or send packets from/to it. The ability to read stats/link is
only a secondary feature.

Wireless subsystem moved away from using dummy/additional netdevices
because it caused confusion.

multiqueue devices forced us to separate struct netdevice and struct
netdev_queue, maybe it's time for more surgery :)
Jamal Hadi Salim Oct. 30, 2013, 11:45 a.m. UTC | #26
On 10/29/13 05:34, Felix Fietkau wrote:
> On 2013-10-28 23:53, Jamal Hadi Salim wrote:

>
> These are simple switches, why would they respond to ARP?
> I suspect that you're attributing too much functionality to the switch
> itself. Think of it as a device similar to the cheap unmanaged ones you
> can buy in a shop and hook up to your machine via Ethernet.
> Add to that some very limited VLAN grouping functionality, and you're
> pretty close to the limits of what these switches can do.
> They don't do ARP, IP or other things. They learn about MAC addresses
> from incoming packets to build their forwarding path.
> The CPU port in this case is whatever port on the switch that you plug
> the cable of your machine into :)

Ok, got it - the only use for cpu for these things is to retrieve things
like stats, link state, etc; can you even read the fdb?


> The FDB related abstraction that you're describing will not work with
> the hardware that I'm talking about. Let's leave that one out of this
> discussion.

sigh - ok. But you gotta help me understand why.

> As for per-port netdevs: Yes, you could pull stats.
> No, flow control messages would not make it through.
> No idea how it would provide a *consistent* API.
> Either way, if adding netdevs just for stats and link state, that could
> be easily added on top of swconfig (or whatever name we pick for it)
> later. I just don't think it's worth it at this point.
>

Ok, progress, lets leave this one out.

>> Can we call that "L3" instead of software bridge?
> L3? Why?

We have two L2 domains. You want to connect them - you need a higher
layer; Layer 3 seems to be the simple one (i.e typically people would
use ip to link two layer 2 broadcast domains).


> I think that's way more confusing to users than presenting a consistent
> model that properly reflects what you can do with the hardware.
>

I think discovery from a control view is always a win.

> But I sense a pattern here. I've long had my beef with quite a few Linux
> network related APIs for being inconsistent, having no decent error
> reporting when you're trying to configure things (errno doesn't count,
> it's just too ambiguous), and just making it hard to figure out the
> capabilities. Of course, none of this can be easily fixed due to ABI
> stability constraints.
> I do NOT wish to follow that pattern!
>

You are preaching to the choir. The whole errno 8 bit thing is a mess;
I used to printk things in the kernel to indicate granularity of
which EINVAL i was returning (but i was shot down); one suggestion is
to also include a string description on the error. But that is a side
issue.
So, nod. Discovery of capabilities is better - you still have to defer
to error codes when all else fails.


> I'm not going to try to enumerate all the case; I have other projects
> that I need to work on. :)
>

I understand. I am busy as well, just saying if we need to reach an
agreement to either agree or disagree we need to capture the esoterics
of the different cases; as you can see i tried to enumerate some in
my previous email. In my case this would be useful to see, using current
mechanisms, that it can or cant be done or can be done with mods etc.

> Only a *tiny* part of the software bridge configuration model can be
> emulated, the rest does not fit and has to be handled through extensions
> or different APIs anyway. That's why I am convinced that it's a really
> bad model to try to make these switches fit into it.
>
> You gain a tiny advantage with writing scripts, but at the same time,
> the code gets more complex, the configuration interface gets more
> confusing, there are more nasty corner cases to take care of.
> Why do you insist on making so many things worse just for one tiny
> advantage? Where's the pragmatic cost/benefit tradeoff?
>

There is nothing wrong with making extensions if they make sense. My
problem so far in this discussion is i havent figured which will be bad
extensions you bring up. My approach is to list things and
then point out which one will require some witchcraft on top of
current interfaces. I am afraid I am still missing that part. Maybe
I have to go back and study your patch some more.

> Right, with most of the switches that we support, almost none of these
> things work in a way that can be integrated with the network stack.
>

Good to know. These are useful components for slightly higher end
switches.


> I'm not even sure what you mean when you say 'cpu port cannot be
> assumed'.

Meant for other devices which are dumb - lets move past this point.

> On pretty much all devices that we work with, one of the ports
> connects to a NIC in the CPU. It's just that the switch cannot be
> assumed to have special treatment for that CPU port. As far as it is
> concerned, it is just another port like the others.
>

Aha. I think i see a small terminology cross-talk. You refer to things
as NICs when i use the term netdev. So now i understand better what you
mean by rx handler (I intepreted earlier to mean something at the tap
level). Ok, so Felix, for the case where we have switches with cpu ports
that can tag incoming packets with ingress port ids - can we say the
NIC rx handler is reasonable to be used as a demux point for the
software version of the ports? I am not talking about the corner
cases.

>> - ive never seen table id, but i think this is another one; in which
>> case the number of table ids becomes something one needs to discover..
> Yes, and this is something that doesn't even map directly to something
> in the software bridge world.
>

It does - There is a single table per bridge on the software bridge
world. You need multiple bridges, one per id.

cheers,
jamal

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jamal Hadi Salim Oct. 30, 2013, 11:50 a.m. UTC | #27
On 10/29/13 19:12, Maxime Bizon wrote:

>
>  From a user POV, when you see a netdevice, you expect to be able to
> receive or send packets from/to it. The ability to read stats/link is
> only a secondary feature.
>

The important part is all the APIs stay consistent. I can use
same netlink calls. ifconfig works.
iproute2 works. People have written books on this stuff - we dont
have MCSE(Must Call Software Engineer) certification, but this is
as close as it gets. i.e the knowledge has been commoditized, even
my kid knows how to use these tools.

If i can get stats by doing ifconfig - that should provide illusion that
the netdevice is sending/receiving packets.

> Wireless subsystem moved away from using dummy/additional netdevices
> because it caused confusion.
>

This is a good arguement.
Can we hear a little more about this?

> multiqueue devices forced us to separate struct netdevice and struct
> netdev_queue, maybe it's time for more surgery :)
>

I think that would be a reasonable thing to do if it becomes necessary.

cheers,
jamal

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Felix Fietkau Oct. 30, 2013, 11:58 a.m. UTC | #28
On 2013-10-30 12:50, Jamal Hadi Salim wrote:
> On 10/29/13 19:12, Maxime Bizon wrote:
> 
>>
>>  From a user POV, when you see a netdevice, you expect to be able to
>> receive or send packets from/to it. The ability to read stats/link is
>> only a secondary feature.
>>
> 
> The important part is all the APIs stay consistent. I can use
> same netlink calls. ifconfig works.
> iproute2 works. People have written books on this stuff - we dont
> have MCSE(Must Call Software Engineer) certification, but this is
> as close as it gets. i.e the knowledge has been commoditized, even
> my kid knows how to use these tools.
> 
> If i can get stats by doing ifconfig - that should provide illusion that
> the netdevice is sending/receiving packets.
Pretty much all of the above have serious limitations when you're not
actually able to run the data path through the per-port netdevs.
You can't assign IP addresses to them. The network stack will probably
even attempt to assign IPv6 link-local addresses to these things,
causing even more confusion.
You can't add them to normal software bridges like other devices.
You can't use bonding. I could probably go on for a while.
There's a huge list of things that you simply cannot do with these
interfaces, and without knowing the details of the implementation, users
will be left clueless as to why that is.
I'd say that's a very serious violation of the principle of least surprise.
And knowing what the typical OpenWrt users do with their devices, I can
already forsee the bogus bug reports trickling in, if this is to be
implemented.

- Felix
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Felix Fietkau Oct. 30, 2013, 12:53 p.m. UTC | #29
On 2013-10-30 12:45, Jamal Hadi Salim wrote:
> On 10/29/13 05:34, Felix Fietkau wrote:
>> On 2013-10-28 23:53, Jamal Hadi Salim wrote:
> 
>>
>> These are simple switches, why would they respond to ARP?
>> I suspect that you're attributing too much functionality to the switch
>> itself. Think of it as a device similar to the cheap unmanaged ones you
>> can buy in a shop and hook up to your machine via Ethernet.
>> Add to that some very limited VLAN grouping functionality, and you're
>> pretty close to the limits of what these switches can do.
>> They don't do ARP, IP or other things. They learn about MAC addresses
>> from incoming packets to build their forwarding path.
>> The CPU port in this case is whatever port on the switch that you plug
>> the cable of your machine into :)
> 
> Ok, got it - the only use for cpu for these things is to retrieve things
> like stats, link state, etc; can you even read the fdb?
Where supported, all you can typically read is a list of which MAC
address was discovered behind which port - if you're lucky. You usually
won't find VLAN information attached to that.
Often it simply isn't supported at all.

>> The FDB related abstraction that you're describing will not work with
>> the hardware that I'm talking about. Let's leave that one out of this
>> discussion.
> 
> sigh - ok. But you gotta help me understand why.
The hardware implementation of MAC address handling isn't even
consistent across chips from different vendors. Often you don't even get
things like the VLAN ID. Sometimes there's a global forwarding table,
sometimes you can have multiple tables and assign them to VLANs.

>>> Can we call that "L3" instead of software bridge?
>> L3? Why?
> 
> We have two L2 domains. You want to connect them - you need a higher
> layer; Layer 3 seems to be the simple one (i.e typically people would
> use ip to link two layer 2 broadcast domains).
If you connect two L2 domains through a bridge, I still consider that L2
- it's still on the same layer, just goes through more hops.

>> I think that's way more confusing to users than presenting a consistent
>> model that properly reflects what you can do with the hardware.
> I think discovery from a control view is always a win.
Yes, and swconfig handles the discovery part fairly well.

>> I'm not going to try to enumerate all the case; I have other projects
>> that I need to work on. :)
> 
> I understand. I am busy as well, just saying if we need to reach an
> agreement to either agree or disagree we need to capture the esoterics
> of the different cases; as you can see i tried to enumerate some in
> my previous email. In my case this would be useful to see, using current
> mechanisms, that it can or cant be done or can be done with mods etc.
At this point, I'm not sure if we will be able to reach an agreement. I
think I've shown over and over again that what you're proposing comes
with huge costs in terms of complexity and bloat, as demonstrated by the
fact that it adds so many corner cases that would have to be dealt with,
including many for which we haven't even the slightest idea of a good
solution.
Now, to make this a viable option, the benefits would have to be big and
significant enough to offset these costs.
The only real benefit you've pointed out so far is to be able to reuse
existing tools/APIs (but only with modifications, not as-is). I think
that's fairly small, when put in perspective with the hard problems that
this approach creates, both for users (hidden traps and surprises) and
for developers (implementation difficulties and incompatible abstractions).

>> Only a *tiny* part of the software bridge configuration model can be
>> emulated, the rest does not fit and has to be handled through extensions
>> or different APIs anyway. That's why I am convinced that it's a really
>> bad model to try to make these switches fit into it.
>>
>> You gain a tiny advantage with writing scripts, but at the same time,
>> the code gets more complex, the configuration interface gets more
>> confusing, there are more nasty corner cases to take care of.
>> Why do you insist on making so many things worse just for one tiny
>> advantage? Where's the pragmatic cost/benefit tradeoff?
>>
> 
> There is nothing wrong with making extensions if they make sense.
Yes, but if the basic abstraction doesn't make sense for the use case,
and it leads to too many corner cases, there's everything wrong with
trying to work around that through extensions.

> My problem so far in this discussion is i havent figured which will be bad
> extensions you bring up. My approach is to list things and
> then point out which one will require some witchcraft on top of
> current interfaces. I am afraid I am still missing that part. Maybe
> I have to go back and study your patch some more.
Sure, go ahead.

>> On pretty much all devices that we work with, one of the ports
>> connects to a NIC in the CPU. It's just that the switch cannot be
>> assumed to have special treatment for that CPU port. As far as it is
>> concerned, it is just another port like the others.
> 
> Aha. I think i see a small terminology cross-talk. You refer to things
> as NICs when i use the term netdev. So now i understand better what you
> mean by rx handler (I intepreted earlier to mean something at the tap
> level). 
I only started using the term NIC to emphasize that it's not just a
netdev of the switch - it's a real Ethernet MAC (usually in the SoC),
with a separate driver that knows nothing about the switch.

> Ok, so Felix, for the case where we have switches with cpu ports
> that can tag incoming packets with ingress port ids - can we say the
> NIC rx handler is reasonable to be used as a demux point for the
> software version of the ports? I am not talking about the corner
> cases.
Yes, but when looking at the big picture, the switch being able to tag
incoming packets with the ingress port is a corner case!
Most switches that we work with aren't actually able to do that!
I want to have a decent baseline implementation that does not assume
this port tagging capability.

>>> - ive never seen table id, but i think this is another one; in which
>>> case the number of table ids becomes something one needs to discover..
>> Yes, and this is something that doesn't even map directly to something
>> in the software bridge world.
> It does - There is a single table per bridge on the software bridge
> world. You need multiple bridges, one per id.
Depends on which software bridge.
If I have two normal netdevs, eth0 and eth1, I can create eth0.4 and
bridge it to eth1.5. That's just one bridge.
I can't easily emulate that with fake per-port netdevs and a typical
switch supported by swconfig.
With just swconfig (no fake netdevs) switches that support these table
ids, I would need to have two VLANs in the switch (both connected to the
CPU port, each one getting a separate table id), and then one software
bridge between eth0.4 and eth0.5 (assuming eth0 connects to the switch).

- Felix
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Maxime Bizon Oct. 30, 2013, 2:28 p.m. UTC | #30
On Wed, 2013-10-30 at 07:50 -0400, Jamal Hadi Salim wrote:

> The important part is all the APIs stay consistent. I can use
> same netlink calls. ifconfig works.
> iproute2 works. People have written books on this stuff - we dont

these books usually start by telling people to assign IP address to
interfaces, not applicable here.

> If i can get stats by doing ifconfig - that should provide illusion that
> the netdevice is sending/receiving packets.

4 separated netdevices looks like 4 ethernet segments to me, and nothing
will prevent me from setting a different ip network on each device.

ENOTSUPP cannot be returned by ndo_start_xmit, the ability for a
netdevice to be able to receive/send packet from host is IMO
fundamental.

> This is a good arguement.
> Can we hear a little more about this?

see this kind of old threads:

http://rt2x00.serialmonkey.com/phpBB/viewtopic.php?f=5&t=4378
http://www.linuxquestions.org/questions/linux-networking-3/what-is-wmaster0-728708/
http://forums.debian.net/viewtopic.php?p=219440

> I think that would be a reasonable thing to do if it becomes
> necessary.

with rough naming:

- struct netdevice
- struct netdev_queue
- struct network_port (something to call ethtool on)
- struct bridge_dev (something you create/destroy vlan on, control FDB)
- struct bridge_port (something you set path cost on, ...)
- struct sw_bridge_dev (netdevice + underlying bridge_dev)
- struct sw_bridge_port (netdevice + underlying bridge_port)

old netdevice => (netdevice + netdev_queue * x + network_port)

ethtool works on netdevice or network_port

brctl addbr/addif creates sw_bridge_dev/sw_bridge_port, other commands
work on bridge_dev/bridge_port

drivers can register bridge_dev / bridge_port / network_port

simple case of a system with single ethernet mac & directly attached 4
ports switch:

netdevice: eth0
bridge_dev: hwbr0
bridge_port: hwbr0p0, hwbr0p1, hwbr0p2, hwbr0p3
network ports: eth0np0, hwbr0np0, hwbr0np1, hwbr0np2, hwbr0np3

ifconfig, ip link show only eth0
brctl show hwbr0
ethtool works on eth0 or eth0p0, hwbr0npX
Lennert Buytenhek Oct. 30, 2013, 5:27 p.m. UTC | #31
I didn't follow the rest of this thread, but..


On Mon, Oct 28, 2013 at 06:53:29PM -0400, Jamal Hadi Salim wrote:

> >That question does not make any sense to me. Aside from low level
> >control frames like pause frames for flow control, the switch has no
> >need to send packets to the CPU port on its own.

..a lot of people want to be able to do Spanning Tree, LLDP, 802.1x,
you name it, on their routers and access points, and that requires
that your CPU can send/receive packets to/from individual ports on
your switch chip.  In a lot of markets, your product is a non-starter
if it can't provide any or all of the above.  Excluding this entire
class of use cases _by software design_ is somewhat myopic and stupid.

(It's a different thing if your switch chip is dumb and can't actually
address individual ports, but then there's still no reason to impose
the same restrictions on your software design.)


> >DSA does this, and last time I looked, it pushes *all* bridge traffic
> >through the CPU, making it completely unusable for slower embedded CPUs.
>
> [...]
>
> >If I remember correctly, adding support 'bridge acceleration' was left
> >as an exercise for the reader and never actually implemented.

This patch does exactly that:

	http://patchwork.ozlabs.org/patch/16578/

This patch is in production use in a couple of million DSL gateways,
as well as in a bunch of airplane in-flight entertainment systems, so
by all means I would say that it works rather well.

If there is renewed interest in having such functionality upstream,
I would be happy to update the patch and submit it for inclusion.


> >Sure, this could be fixed somehow, but even then the model and
> >assumptions that DSA is built on simply don't work for some of the
> >dumber switches that we support.

What model and assumptions would those be?


> >One of the currently very common switches in many embedded devices is
> >the RTL8366/RTL8367. It has some flexibility when it comes to
> >configuring VLANs, and it's one of the few ones where you can configure
> >a forwarding table for a VLAN (which spans multiple ports), which allows
> >software bridging between multiple VLANs.
> >However, what this switch does *not* support is adding a header/trailer
> >to packets to indicate the originating port.

The ingress/egress port doesn't _have_ to be conveyed in the data
packets themselves.

From a quick look at the RTL8366 datasheet, you can control the
egress port by creating a temporary MAC address table entry (this
seems to work both for unicast and multicast packets).

Admittedly, I didn't have a very thorough look at the datasheet,
but it also mentions the Spanning Tree protocol, and contains this
remark related to receiving BPDUs: "The CPU port should carry the
ingress port number of the receiving BPDU.".  If this switch chip
can't do per-port addressing, how can it actually ever speak STP
at all?  Is the datasheet just lying about this?


> >This means that all per-port netdevs will be dummy ports which don't
> >include the data path.

And I think that's fine.

Look, even if you're not going to address data traffic to individual
ports on your switch chip, there's still a plethora of per-port
operations that you want to be able to do: administratively setting
the link state on ports up and down, controlling autonegotiation and
other PHY settings on individual ports, etc.

You can either let the administrator do this with the standard ifconfig
/ ip link / ethtool tools, or you can make up a parallel API and
corresponding set of userland tools to duplicate most of the existing
functionality -- I know which option I prefer.

Presenting each switch port as an individual Linux netdevice to the OS
is an orthogonal decision to actually using those netdevices for data
traffic, and conflating the two by arguing that you need special tools
to do per-port operations for the sole reason that your switch chip
cannot address individual ports is a rather confused argument.


> My view is that netdevs are still valuable even if only they get
> used for control path. Like you said earlier - you can still pull
> stats, flow control messages still make it through etc. They provide
> you
> the consistent api to configure the switch above, ex:
> If i was to use the FDB api for this switch as long as i can
> abstract it in software as a bridge, I could send it a switch config
> via its ops which says:
> "I am giving you this entry with vland 400 for port 2, but i want you to
> send it to the hardware not to your local entry"

Fully agreed on this.


> >So let's say you have a configuration where you're using VLAN ID 4 on
> >port 1, and you want to bridge it to VLAN ID 400 on port 2.
> >
> >Sounds easy enough, you can easily create a bridge that spans port1.4
> >and port2.400. Except, this particular switch (like pretty much any
> >other switch supported by swconfig) isn't actually able to handle such a
> >configuration on its own.

Neither can DSA switch chips.

You can always find things that Linux can do that your switch chip
cannot (e.g. stateful firewalling between ports), and that isn't much
of an argument for or against anything.


> >With swconfig, you create two VLANs: VLAN 4, containing CPU and port1;
> >VLAN 400, containing CPU and port2. You then create a software bridge
> >between eth0.4 and eth0.400 (assuming eth0 is the NIC connected to the
> >switch).

With DSA, you would bridge between port1.4 and port2.400.  I'm still
not sure what your argument is arguing for or against.


> >In a different scenario, the code would also have to detect
> >configurations that the switch isn't able to handle, e.g.: bridging
> >port1.4 to eth1 and port2.4 to eth2.
> >Such a configuration wouldn't work at all with such a switch, because
> >the CPU isn't able to tell apart traffic from port1 and port2, and
> >there's no way to tell the switch that port1.4 and port2.4 should not be
> >connected to each other, but both should go to the CPU.

And it's quite easy to detect what your switch chip can do and offload
that part to the hardware, and keep doing the rest in software.


> >Trying to make all of these cases work in the code will make the whole
> >thing a lot more difficult to deal with and maintain. It will also make
> >it much harder for the user to figure out, what configurations work, and
> >what configurations don't.

It's actually quite easy, and certainly a lot less total effort than
forcing all of your users to learn a new set of userland tools (unless
you're not aiming to ever have a lot of users, that is..).


thanks,
Lennert
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Lennert Buytenhek Oct. 30, 2013, 5:34 p.m. UTC | #32
On Wed, Oct 30, 2013 at 06:27:56PM +0100, Lennert Buytenhek wrote:

> > >This means that all per-port netdevs will be dummy ports which don't
> > >include the data path.
> 
> And I think that's fine.
> 
> Look, even if you're not going to address data traffic to individual
> ports on your switch chip, there's still a plethora of per-port
> operations that you want to be able to do: administratively setting
> the link state on ports up and down, controlling autonegotiation and
> other PHY settings on individual ports, etc.
> 
> You can either let the administrator do this with the standard ifconfig
> / ip link / ethtool tools, or you can make up a parallel API and
> corresponding set of userland tools to duplicate most of the existing
> functionality -- I know which option I prefer.
> 
> Presenting each switch port as an individual Linux netdevice to the OS
> is an orthogonal decision to actually using those netdevices for data
> traffic, and conflating the two by arguing that you need special tools
> to do per-port operations for the sole reason that your switch chip
> cannot address individual ports is a rather confused argument.

Forgot to add: there's a patch for net/dsa that adds exactly such an
option.  We called it 'unmanaged' mode, and it doesn't enable packet
tagging on the CPU<->switch chip interface, so that data only ever
flows over a single network interface ("eth0"), while the other
("dummy") network interfaces ("port1", "port2", etc) are used for
setting link state with ip link, setting PHY settings with ethtool,
getting ethtool statistics, etc, with 100% unmodified userland tools.
This patch is currently buried inside a vendor tree, but I'd be happy
to dig it out and submit it.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
John Fastabend Oct. 30, 2013, 5:56 p.m. UTC | #33
On 10/30/2013 10:34 AM, Lennert Buytenhek wrote:
> On Wed, Oct 30, 2013 at 06:27:56PM +0100, Lennert Buytenhek wrote:
>
>>>> This means that all per-port netdevs will be dummy ports which don't
>>>> include the data path.
>>
>> And I think that's fine.
>>
>> Look, even if you're not going to address data traffic to individual
>> ports on your switch chip, there's still a plethora of per-port
>> operations that you want to be able to do: administratively setting
>> the link state on ports up and down, controlling autonegotiation and
>> other PHY settings on individual ports, etc.
>>
>> You can either let the administrator do this with the standard ifconfig
>> / ip link / ethtool tools, or you can make up a parallel API and
>> corresponding set of userland tools to duplicate most of the existing
>> functionality -- I know which option I prefer.
>>
>> Presenting each switch port as an individual Linux netdevice to the OS
>> is an orthogonal decision to actually using those netdevices for data
>> traffic, and conflating the two by arguing that you need special tools
>> to do per-port operations for the sole reason that your switch chip
>> cannot address individual ports is a rather confused argument.
>
> Forgot to add: there's a patch for net/dsa that adds exactly such an
> option.  We called it 'unmanaged' mode, and it doesn't enable packet
> tagging on the CPU<->switch chip interface, so that data only ever
> flows over a single network interface ("eth0"), while the other
> ("dummy") network interfaces ("port1", "port2", etc) are used for
> setting link state with ip link, setting PHY settings with ethtool,
> getting ethtool statistics, etc, with 100% unmodified userland tools.
> This patch is currently buried inside a vendor tree, but I'd be happy
> to dig it out and submit it.
>

A "dummy" network interface is something I've been thinking about
for SR-IOV as well. In the SR-IOV case we have an embedded bridge
in the hardware but the virtual functions may be direct assigned
to a guest and not visible to the host.

It would be easier to manage the ports and assign them to different
bridge/QOS objects (OVS, bridge, nftables) if the ports were visible
and manageable in the host even though there is no data path. Today
we special ndo ops that only work for VFs but this is a bit clumsy
and gets more clumsy as the nic switch becomes more like a real switch.

.John
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
John Fastabend Oct. 30, 2013, 5:56 p.m. UTC | #34
On 10/30/2013 10:34 AM, Lennert Buytenhek wrote:
> On Wed, Oct 30, 2013 at 06:27:56PM +0100, Lennert Buytenhek wrote:
>
>>>> This means that all per-port netdevs will be dummy ports which don't
>>>> include the data path.
>>
>> And I think that's fine.
>>
>> Look, even if you're not going to address data traffic to individual
>> ports on your switch chip, there's still a plethora of per-port
>> operations that you want to be able to do: administratively setting
>> the link state on ports up and down, controlling autonegotiation and
>> other PHY settings on individual ports, etc.
>>
>> You can either let the administrator do this with the standard ifconfig
>> / ip link / ethtool tools, or you can make up a parallel API and
>> corresponding set of userland tools to duplicate most of the existing
>> functionality -- I know which option I prefer.
>>
>> Presenting each switch port as an individual Linux netdevice to the OS
>> is an orthogonal decision to actually using those netdevices for data
>> traffic, and conflating the two by arguing that you need special tools
>> to do per-port operations for the sole reason that your switch chip
>> cannot address individual ports is a rather confused argument.
>
> Forgot to add: there's a patch for net/dsa that adds exactly such an
> option.  We called it 'unmanaged' mode, and it doesn't enable packet
> tagging on the CPU<->switch chip interface, so that data only ever
> flows over a single network interface ("eth0"), while the other
> ("dummy") network interfaces ("port1", "port2", etc) are used for
> setting link state with ip link, setting PHY settings with ethtool,
> getting ethtool statistics, etc, with 100% unmodified userland tools.
> This patch is currently buried inside a vendor tree, but I'd be happy
> to dig it out and submit it.
>

A "dummy" network interface is something I've been thinking about
for SR-IOV nics as well. In the SR-IOV case we have an embedded bridge
in the hardware but the virtual functions may be direct assigned
to a guest and not visible to the host.

It would be easier to manage the ports and assign them to different
bridge/QOS objects (OVS, bridge, nftables) if the ports were visible
and manageable in the host even though there is no data path. Today
we special ndo ops that only work for VFs but this is a bit clumsy
and gets more clumsy as the nic switch becomes more like a real switch.

.John
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Felix Fietkau Oct. 30, 2013, 7:47 p.m. UTC | #35
On 2013-10-30 18:27, Lennert Buytenhek wrote:
> I didn't follow the rest of this thread, but..
> 
> 
> On Mon, Oct 28, 2013 at 06:53:29PM -0400, Jamal Hadi Salim wrote:
> 
>> >That question does not make any sense to me. Aside from low level
>> >control frames like pause frames for flow control, the switch has no
>> >need to send packets to the CPU port on its own.
> 
> ..a lot of people want to be able to do Spanning Tree, LLDP, 802.1x,
> you name it, on their routers and access points, and that requires
> that your CPU can send/receive packets to/from individual ports on
> your switch chip.  In a lot of markets, your product is a non-starter
> if it can't provide any or all of the above.  Excluding this entire
> class of use cases _by software design_ is somewhat myopic and stupid.
> 
> (It's a different thing if your switch chip is dumb and can't actually
> address individual ports, but then there's still no reason to impose
> the same restrictions on your software design.)
Many of the switches we support can't address individual ports via tags.
You usually just set up VLANs and let the switch do the rest.

>> >DSA does this, and last time I looked, it pushes *all* bridge traffic
>> >through the CPU, making it completely unusable for slower embedded CPUs.
>>
>> [...]
>>
>> >If I remember correctly, adding support 'bridge acceleration' was left
>> >as an exercise for the reader and never actually implemented.
> 
> This patch does exactly that:
> 
> 	http://patchwork.ozlabs.org/patch/16578/
> 
> This patch is in production use in a couple of million DSL gateways,
> as well as in a bunch of airplane in-flight entertainment systems, so
> by all means I would say that it works rather well.
> 
> If there is renewed interest in having such functionality upstream,
> I would be happy to update the patch and submit it for inclusion.
Yes, I would really like to see this merged. If we can somehow get the
bridge offload stuff to handle VLAN trunking as well, I'd be interested
in looking into DSA support for some Atheros switches that I've been
working with.

>> >Sure, this could be fixed somehow, but even then the model and
>> >assumptions that DSA is built on simply don't work for some of the
>> >dumber switches that we support.
> 
> What model and assumptions would those be?
The assumption of being able to address individual ports via tags.

>> >One of the currently very common switches in many embedded devices is
>> >the RTL8366/RTL8367. It has some flexibility when it comes to
>> >configuring VLANs, and it's one of the few ones where you can configure
>> >a forwarding table for a VLAN (which spans multiple ports), which allows
>> >software bridging between multiple VLANs.
>> >However, what this switch does *not* support is adding a header/trailer
>> >to packets to indicate the originating port.
> 
> The ingress/egress port doesn't _have_ to be conveyed in the data
> packets themselves.
> 
> From a quick look at the RTL8366 datasheet, you can control the
> egress port by creating a temporary MAC address table entry (this
> seems to work both for unicast and multicast packets).
Sounds nasty. I certainly wouldn't want this called from the data path,
since on some systems register access will be bit-banged over GPIO.

> Admittedly, I didn't have a very thorough look at the datasheet,
> but it also mentions the Spanning Tree protocol, and contains this
> remark related to receiving BPDUs: "The CPU port should carry the
> ingress port number of the receiving BPDU.".  If this switch chip
> can't do per-port addressing, how can it actually ever speak STP
> at all?  Is the datasheet just lying about this?
From what the datasheet, it looks like it expects the CPU to guess the
port based on the VLAN ID. This is RealTek, so it might just be
theoretically possible to do what the datasheet says, but quirky enough
to be unusable in practice.

>> >This means that all per-port netdevs will be dummy ports which don't
>> >include the data path.
> 
> And I think that's fine.
> 
> Look, even if you're not going to address data traffic to individual
> ports on your switch chip, there's still a plethora of per-port
> operations that you want to be able to do: administratively setting
> the link state on ports up and down, controlling autonegotiation and
> other PHY settings on individual ports, etc.
> 
> You can either let the administrator do this with the standard ifconfig
> / ip link / ethtool tools, or you can make up a parallel API and
> corresponding set of userland tools to duplicate most of the existing
> functionality -- I know which option I prefer.
> 
> Presenting each switch port as an individual Linux netdevice to the OS
> is an orthogonal decision to actually using those netdevices for data
> traffic, and conflating the two by arguing that you need special tools
> to do per-port operations for the sole reason that your switch chip
> cannot address individual ports is a rather confused argument.
The thing that most swconfig users in OpenWrt care about is being able
to group ports into VLANs, sometimes just to be able to split them into
LAN/WAN, sometimes to be able to use one port as a trunking port to
connect multiple networks (some of which may be on ports of the same
switch, some behind the CPU port).
I care about that part a lot more than messing around with the
individual ports.
If we can figure out a clean and simple way to support this well, even
on switches that are seriously limited wrt. individual port addressing
via the data path, I'd be more willing to consider it.
I still don't like the dummy netdev thing very much, because I know
enough users that will easily get confused by this, and with a separate
interface they at least know that there's a separate set of rules to it.
I don't think that's a confused argument.

>> >With swconfig, you create two VLANs: VLAN 4, containing CPU and port1;
>> >VLAN 400, containing CPU and port2. You then create a software bridge
>> >between eth0.4 and eth0.400 (assuming eth0 is the NIC connected to the
>> >switch).
> 
> With DSA, you would bridge between port1.4 and port2.400.  I'm still
> not sure what your argument is arguing for or against.
I'm saying most switches that we support cannot do DSA-style packet port
tagging for ingress/egress. That kind of setup can be done with some
software bridging when setting up VLAN tables appropriately, but I'm not
sure it's possible to emulate this if you're treating the switch as a
'bridge' and trying to do handle this via the FDB API, which is what we
were discussing earlier.

- Felix
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Florian Fainelli Dec. 7, 2013, 1:45 a.m. UTC | #36
2013/10/30 Felix Fietkau <nbd@openwrt.org>:
>> With DSA, you would bridge between port1.4 and port2.400.  I'm still
>> not sure what your argument is arguing for or against.
> I'm saying most switches that we support cannot do DSA-style packet port
> tagging for ingress/egress. That kind of setup can be done with some
> software bridging when setting up VLAN tables appropriately, but I'm not
> sure it's possible to emulate this if you're treating the switch as a
> 'bridge' and trying to do handle this via the FDB API, which is what we
> were discussing earlier.

DSA works nicely because it has its own Ethertype, and also because
the switches are configurable enough to allow for the insertion of a
custom Ethertype for which you can register a corresponding
dev_add_pack() handler. Even on some switches which support custom
tags appended or prepended such as those from Broadcom, a tag is
usually inserted between ethe Ethernet Source Address and the
Ethertype, but this tag has variable fields. I could imagine that we
extend the dev_add_pack() to include a mask to perform the matching,
but this is going to add an "and" logical operation in a hot-path,
would that be acceptable?

And this only works when the tag is prepended, if it is appended, we
are kind of stuck because we do not really have any way to look for it
at the end of the frame but assume that is here/not here...
diff mbox

Patch

diff --git a/Documentation/networking/swconfig.txt b/Documentation/networking/swconfig.txt
new file mode 100644
index 0000000..f560066
--- /dev/null
+++ b/Documentation/networking/swconfig.txt
@@ -0,0 +1,162 @@ 
+Generic Netlink Switch configuration API
+
+Introduction
+============
+
+The following documentation covers the Linux Ethernet switch configuration API
+which is based on the Generic Netlink infrastructure.
+
+Scope and rationale
+===================
+
+Most Ethernet switches found in small routers are managed switches which allow
+the following operations:
+
+- configure a port to belong to a particular set of VLANs either as tagged or
+  untagged
+- configure a particular port to advertise specific link/speed/duplex settings
+- collect statistics about the number of packets/bytes transferred/received
+- any other vendor specific feature: rate limiting, single/double tagging...
+
+Such switches can be connected to the controlling CPU using different hardware
+busses, but most commonly:
+
+- SPI/I2C/GPIO bitbanging
+- MDIO
+- Memory mapped into the CPU register address space
+
+As of today the usual way to configure such a switch was either to write a
+specific driver or to write an user-space application which would have to know
+about the hardware differences and figure out a way to access the switch
+registers (spidev, SIOCIGGMIIREG, mmap...) from user-space.
+
+This has multiple issues:
+
+- proliferation of ad-hoc solutions to configure a switch both open source and
+  proprietary
+
+- absence of common software reference for switches commonly found on the market
+  (Broadcom, Lantiq/Infineon/ADMTek, Marvell, Qualcomm/Atheros...) which implies
+  a duplication effort for each implementer
+
+- inability to leverage existing hardware representation mechanisms such as
+  Device Tree (spidev, i2c-dev.. do not belong in Device Tree and rely on
+  Linux-specific "forwarder" drivers) to describe a switch device
+
+The goal of the switch configuration API is to provide a common basis to build
+re-usable and extensible switch drivers with the following ideas in mind:
+
+- having a central point of configuration on top of which a reference user-space
+  implementation can be provided but also allow for other user-space
+  implementations to exist
+
+- ensure the Linux kernel is in control of the actual hardware access
+
+- be extensible enough to support per-switch features without making the generic
+  implementation too heavy weighted and without making user-space changes each
+  and every time a new feature is added
+
+Based on these design goals the Generic Netlink kernel/user-space communication
+mechanism was chosen because it allows for all design goals to be met.
+
+Distributed Switch Architecture vs. swconfig
+============================================
+
+The Marvell Distributed Switch Architecture drivers is an existing solution
+which is a heavy switch driver infrastructure, is Marvell centric, only
+supports MDIO connected switches, mangles an Ethernet driver transmit/receive
+paths and does not offer a central control path for the user.
+
+swconfig is vendor agnostic, does not mangle the transmit/receive path
+of an Ethernet driver and is focused on the control path of the switch rather
+that the data path. It is based on Generic Netlink to allow for each switch
+driver to easily extend the swconfig API without causing major core parts rework
+each and every time someone has a specific feature to implement and offers a
+central configuration point with a well-defined API.
+
+Switch configuration API
+========================
+
+The main data structure of the switch configuration API is a "struct switch_dev"
+which contains the following members:
+
+- a set of common operations to all switches (struct switch_dev_ops)
+- a network device pointer it is physically attached to
+- a number of physical switch ports (including CPU port)
+- a set of configured vlans
+- a CPU specific port index
+
+A particular switch device is registered/unregistered using the following pair
+of functions:
+
+register_switch(struct switch_dev *sw_dev, struct net_device *dev);
+unregister_switch(struct switch_dev);
+
+A given switch driver can be backed by any kind of underlying bus driver (i2c
+client, GPIO driver, MMIO driver, directly into the Ethernet MAC driver...).
+
+The set of common operations to all switches is represented by the "struct
+switch_dev_ops" function pointers, these common operations are defined as such:
+
+- get the port list of a VLAN identifier
+- set the port list of a VLAN identifier
+- get the primary VLAN identifier of a port
+- set the primary VLAN identifier of a port
+- apply the changed configuration to the switch
+- reset the switch
+- get a port link status
+- get a port statistics counters
+
+The switch_dev_ops structure also contains an extensible way of representing and
+querying switch specific features, 3 different types of attributes are
+available:
+
+- global attributes: attributes global to a switch (name, identifier, number of
+  ports)
+- port attributes: per-port specific attributes (MIB counters, enabling port
+  mirroring...)
+- vlan attributes: per-VLAN specific attributes (VLAN id, specific VLAN
+  information)
+
+Each of these 3 categories must be represented using an array of "struct
+switch_attr" attributes. This structure must be filed with:
+
+- an unique name for the operation
+- a description for the operation
+- a setter operation
+- a getter operation
+- a data type (string, integer, port)
+- eventual min/max limits to validate user input data
+
+The "struct switch_attr" directly maps to a Generic Netlink type of command and
+will be automatically discovered by the "swconfig" user-space utility without
+requiring user-space changes.
+
+User-space reference tool
+=========================
+
+A reference user-space implementation is provided in tools/swconfig in order to
+directly configure and use a particular switch driver. This reference
+implementation is linking against libnl-1 for the moment.
+
+To build it:
+
+make -C tools/swconfig
+
+To list the available switches:
+
+./tools/swconfig list
+
+And to show a particular switch configuration for instance:
+
+./tools/swconfig dev eth0 show
+
+Fake (simulation) switch driver
+===============================
+
+A fake switch driver called swconfig-hwsim is provided in order to allow for
+easy testing of API changes and to perform regression testing. This driver will
+automatically map to the loopback device and will create a fake switch of up to
+8 Gigabit ports. Each of these ports can be configured with separate
+speed/duplex/link settings. This driver is gated with the CONFIG_SWCONFIG_HWSIM
+configuration symbol.
diff --git a/MAINTAINERS b/MAINTAINERS
index f169259..3a54262 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -8117,6 +8117,16 @@  F:	lib/swiotlb.c
 F:	arch/*/kernel/pci-swiotlb.c
 F:	include/linux/swiotlb.h
 
+SWITCH CONFIGURATION API
+M:	Florian Fainelli <f.fainelli@gmail.com>
+L:	openwrt-devel@lists.openwrt.org
+L:	netdev@vger.kernel.org
+S:	Supported
+F:	drivers/net/ethernet/phy/swconfig*.c
+F:	include/uapi/linux/switch.h
+F:	include/linux/switch.h
+F:	Documentation/networking/swconfig.txt
+
 SYNOPSYS ARC ARCHITECTURE
 M:	Vineet Gupta <vgupta@synopsys.com>
 S:	Supported
diff --git a/drivers/net/phy/Kconfig b/drivers/net/phy/Kconfig
index 342561a..9b3e117 100644
--- a/drivers/net/phy/Kconfig
+++ b/drivers/net/phy/Kconfig
@@ -12,6 +12,12 @@  menuconfig PHYLIB
 
 if PHYLIB
 
+config SWCONFIG
+	tristate "Switch configuration API"
+	---help---
+	  Switch configuration API using netlink. This allows
+	  you to configure the VLAN features of certain switches.
+
 comment "MII PHY device drivers"
 
 config AT803X_PHY
diff --git a/drivers/net/phy/Makefile b/drivers/net/phy/Makefile
index 23a2ab2..268c7de 100644
--- a/drivers/net/phy/Makefile
+++ b/drivers/net/phy/Makefile
@@ -3,6 +3,7 @@ 
 libphy-objs			:= phy.o phy_device.o mdio_bus.o
 
 obj-$(CONFIG_PHYLIB)		+= libphy.o
+obj-$(CONFIG_SWCONFIG)		+= swconfig.o
 obj-$(CONFIG_MARVELL_PHY)	+= marvell.o
 obj-$(CONFIG_DAVICOM_PHY)	+= davicom.o
 obj-$(CONFIG_CICADA_PHY)	+= cicada.o
diff --git a/drivers/net/phy/swconfig.c b/drivers/net/phy/swconfig.c
new file mode 100644
index 0000000..9997c35
--- /dev/null
+++ b/drivers/net/phy/swconfig.c
@@ -0,0 +1,1078 @@ 
+/*
+ * Switch configuration API
+ *
+ * Copyright (C) 2008 Felix Fietkau <nbd@openwrt.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/if.h>
+#include <linux/if_ether.h>
+#include <linux/capability.h>
+#include <linux/skbuff.h>
+#include <linux/swconfig.h>
+
+#define SWCONFIG_DEVNAME	"switch%d"
+
+MODULE_AUTHOR("Felix Fietkau <nbd@openwrt.org>");
+MODULE_LICENSE("GPL");
+
+static int swdev_id;
+static struct list_head swdevs;
+static DEFINE_SPINLOCK(swdevs_lock);
+struct swconfig_callback;
+
+struct swconfig_callback {
+	struct sk_buff *msg;
+	struct genlmsghdr *hdr;
+	struct genl_info *info;
+	int cmd;
+
+	/* callback for filling in the message data */
+	int (*fill)(struct swconfig_callback *cb, void *arg);
+
+	/* callback for closing the message before sending it */
+	int (*close)(struct swconfig_callback *cb, void *arg);
+
+	struct nlattr *nest[4];
+	int args[4];
+};
+
+/* defaults */
+
+static int
+swconfig_get_vlan_ports(struct switch_dev *dev,
+			const struct switch_attr *attr, struct switch_val *val)
+{
+	int ret;
+	if (val->port_vlan >= dev->vlans)
+		return -EINVAL;
+
+	if (!dev->ops->get_vlan_ports)
+		return -EOPNOTSUPP;
+
+	ret = dev->ops->get_vlan_ports(dev, val);
+	return ret;
+}
+
+static int
+swconfig_set_vlan_ports(struct switch_dev *dev,
+			const struct switch_attr *attr, struct switch_val *val)
+{
+	struct switch_port *ports = val->value.ports;
+	const struct switch_dev_ops *ops = dev->ops;
+	int i;
+
+	if (val->port_vlan >= dev->vlans)
+		return -EINVAL;
+
+	/* validate ports */
+	if (val->len > dev->ports)
+		return -EINVAL;
+
+	if (!ops->set_vlan_ports)
+		return -EOPNOTSUPP;
+
+	for (i = 0; i < val->len; i++) {
+		if (ports[i].id >= dev->ports)
+			return -EINVAL;
+
+		if (ops->set_port_pvid &&
+		    !(ports[i].flags & (1 << SWITCH_PORT_FLAG_TAGGED)))
+			ops->set_port_pvid(dev, ports[i].id, val->port_vlan);
+	}
+
+	return ops->set_vlan_ports(dev, val);
+}
+
+static int
+swconfig_set_pvid(struct switch_dev *dev,
+			const struct switch_attr *attr, struct switch_val *val)
+{
+	if (val->port_vlan >= dev->ports)
+		return -EINVAL;
+
+	if (!dev->ops->set_port_pvid)
+		return -EOPNOTSUPP;
+
+	return dev->ops->set_port_pvid(dev, val->port_vlan, val->value.i);
+}
+
+static int
+swconfig_get_pvid(struct switch_dev *dev,
+			const struct switch_attr *attr, struct switch_val *val)
+{
+	if (val->port_vlan >= dev->ports)
+		return -EINVAL;
+
+	if (!dev->ops->get_port_pvid)
+		return -EOPNOTSUPP;
+
+	return dev->ops->get_port_pvid(dev, val->port_vlan, &val->value.i);
+}
+
+static const char *
+swconfig_speed_str(enum switch_port_speed speed)
+{
+	switch (speed) {
+	case SWITCH_PORT_SPEED_10:
+		return "10baseT";
+	case SWITCH_PORT_SPEED_100:
+		return "100baseT";
+	case SWITCH_PORT_SPEED_1000:
+		return "1000baseT";
+	default:
+		break;
+	}
+
+	return "unknown";
+}
+
+static int
+swconfig_get_link(struct switch_dev *dev,
+			const struct switch_attr *attr, struct switch_val *val)
+{
+	struct switch_port_link link;
+	int len;
+	int ret;
+
+	if (val->port_vlan >= dev->ports)
+		return -EINVAL;
+
+	if (!dev->ops->get_port_link)
+		return -EOPNOTSUPP;
+
+	memset(&link, 0, sizeof(link));
+	ret = dev->ops->get_port_link(dev, val->port_vlan, &link);
+	if (ret)
+		return ret;
+
+	memset(dev->buf, 0, sizeof(dev->buf));
+
+	if (link.link)
+		len = snprintf(dev->buf, sizeof(dev->buf),
+			       "port:%d link:up speed:%s %s-duplex %s%s%s",
+			       val->port_vlan,
+			       swconfig_speed_str(link.speed),
+			       link.duplex ? "full" : "half",
+			       link.tx_flow ? "txflow " : "",
+			       link.rx_flow ?	"rxflow " : "",
+			       link.aneg ? "auto" : "");
+	else
+		len = snprintf(dev->buf, sizeof(dev->buf), "port:%d link:down",
+			       val->port_vlan);
+
+	val->value.s = dev->buf;
+	val->len = len;
+
+	return 0;
+}
+
+static int
+swconfig_apply_config(struct switch_dev *dev,
+			const struct switch_attr *attr, struct switch_val *val)
+{
+	/* don't complain if not supported by the switch driver */
+	if (!dev->ops->apply_config)
+		return 0;
+
+	return dev->ops->apply_config(dev);
+}
+
+static int
+swconfig_reset_switch(struct switch_dev *dev,
+			const struct switch_attr *attr, struct switch_val *val)
+{
+	/* don't complain if not supported by the switch driver */
+	if (!dev->ops->reset_switch)
+		return 0;
+
+	return dev->ops->reset_switch(dev);
+}
+
+enum global_defaults {
+	GLOBAL_APPLY,
+	GLOBAL_RESET,
+};
+
+enum vlan_defaults {
+	VLAN_PORTS,
+};
+
+enum port_defaults {
+	PORT_PVID,
+	PORT_LINK,
+};
+
+static struct switch_attr default_global[] = {
+	[GLOBAL_APPLY] = {
+		.type = SWITCH_TYPE_NOVAL,
+		.name = "apply",
+		.description = "Activate changes in the hardware",
+		.set = swconfig_apply_config,
+	},
+	[GLOBAL_RESET] = {
+		.type = SWITCH_TYPE_NOVAL,
+		.name = "reset",
+		.description = "Reset the switch",
+		.set = swconfig_reset_switch,
+	}
+};
+
+static struct switch_attr default_port[] = {
+	[PORT_PVID] = {
+		.type = SWITCH_TYPE_INT,
+		.name = "pvid",
+		.description = "Primary VLAN ID",
+		.set = swconfig_set_pvid,
+		.get = swconfig_get_pvid,
+	},
+	[PORT_LINK] = {
+		.type = SWITCH_TYPE_STRING,
+		.name = "link",
+		.description = "Get port link information",
+		.set = NULL,
+		.get = swconfig_get_link,
+	}
+};
+
+static struct switch_attr default_vlan[] = {
+	[VLAN_PORTS] = {
+		.type = SWITCH_TYPE_PORTS,
+		.name = "ports",
+		.description = "VLAN port mapping",
+		.set = swconfig_set_vlan_ports,
+		.get = swconfig_get_vlan_ports,
+	},
+};
+
+static const struct switch_attr *
+swconfig_find_attr_by_name(const struct switch_attrlist *alist,
+				const char *name)
+{
+	int i;
+
+	for (i = 0; i < alist->n_attr; i++)
+		if (strcmp(name, alist->attr[i].name) == 0)
+			return &alist->attr[i];
+
+	return NULL;
+}
+
+static void swconfig_defaults_init(struct switch_dev *dev)
+{
+	const struct switch_dev_ops *ops = dev->ops;
+
+	dev->def_global = 0;
+	dev->def_vlan = 0;
+	dev->def_port = 0;
+
+	if (ops->get_vlan_ports || ops->set_vlan_ports)
+		set_bit(VLAN_PORTS, &dev->def_vlan);
+
+	if (ops->get_port_pvid || ops->set_port_pvid)
+		set_bit(PORT_PVID, &dev->def_port);
+
+	if (ops->get_port_link &&
+	    !swconfig_find_attr_by_name(&ops->attr_port, "link"))
+		set_bit(PORT_LINK, &dev->def_port);
+
+	/* always present, can be no-op */
+	set_bit(GLOBAL_APPLY, &dev->def_global);
+	set_bit(GLOBAL_RESET, &dev->def_global);
+}
+
+
+static struct genl_family switch_fam = {
+	.id = GENL_ID_GENERATE,
+	.name = "switch",
+	.hdrsize = 0,
+	.version = 1,
+	.maxattr = SWITCH_ATTR_MAX,
+};
+
+static const struct nla_policy switch_policy[SWITCH_ATTR_MAX+1] = {
+	[SWITCH_ATTR_ID] = { .type = NLA_U32 },
+	[SWITCH_ATTR_OP_ID] = { .type = NLA_U32 },
+	[SWITCH_ATTR_OP_PORT] = { .type = NLA_U32 },
+	[SWITCH_ATTR_OP_VLAN] = { .type = NLA_U32 },
+	[SWITCH_ATTR_OP_VALUE_INT] = { .type = NLA_U32 },
+	[SWITCH_ATTR_OP_VALUE_STR] = { .type = NLA_NUL_STRING },
+	[SWITCH_ATTR_OP_VALUE_PORTS] = { .type = NLA_NESTED },
+	[SWITCH_ATTR_TYPE] = { .type = NLA_U32 },
+};
+
+static const struct nla_policy port_policy[SWITCH_PORT_ATTR_MAX+1] = {
+	[SWITCH_PORT_ID] = { .type = NLA_U32 },
+	[SWITCH_PORT_FLAG_TAGGED] = { .type = NLA_FLAG },
+};
+
+static inline void
+swconfig_lock(void)
+{
+	spin_lock(&swdevs_lock);
+}
+
+static inline void
+swconfig_unlock(void)
+{
+	spin_unlock(&swdevs_lock);
+}
+
+static struct switch_dev *
+swconfig_get_dev(struct genl_info *info)
+{
+	struct switch_dev *dev = NULL;
+	struct switch_dev *p;
+	int id;
+
+	if (!info->attrs[SWITCH_ATTR_ID])
+		goto done;
+
+	id = nla_get_u32(info->attrs[SWITCH_ATTR_ID]);
+	swconfig_lock();
+	list_for_each_entry(p, &swdevs, dev_list) {
+		if (id != p->id)
+			continue;
+
+		dev = p;
+		break;
+	}
+	if (dev)
+		mutex_lock(&dev->sw_mutex);
+	else
+		pr_debug("device %d not found\n", id);
+	swconfig_unlock();
+done:
+	return dev;
+}
+
+static inline void
+swconfig_put_dev(struct switch_dev *dev)
+{
+	mutex_unlock(&dev->sw_mutex);
+}
+
+static int
+swconfig_dump_attr(struct swconfig_callback *cb, void *arg)
+{
+	struct switch_attr *op = arg;
+	struct genl_info *info = cb->info;
+	struct sk_buff *msg = cb->msg;
+	int id = cb->args[0];
+	void *hdr;
+
+	hdr = genlmsg_put(msg, info->snd_portid, info->snd_seq, &switch_fam,
+			NLM_F_MULTI, SWITCH_CMD_NEW_ATTR);
+	if (IS_ERR(hdr))
+		return -1;
+
+	if (nla_put_u32(msg, SWITCH_ATTR_OP_ID, id))
+		goto nla_put_failure;
+	if (nla_put_u32(msg, SWITCH_ATTR_OP_TYPE, op->type))
+		goto nla_put_failure;
+	if (nla_put_string(msg, SWITCH_ATTR_OP_NAME, op->name))
+		goto nla_put_failure;
+	if (op->description)
+		if (nla_put_string(msg, SWITCH_ATTR_OP_DESCRIPTION,
+			op->description))
+			goto nla_put_failure;
+
+	return genlmsg_end(msg, hdr);
+nla_put_failure:
+	genlmsg_cancel(msg, hdr);
+	return -EMSGSIZE;
+}
+
+/* spread multipart messages across multiple message buffers */
+static int
+swconfig_send_multipart(struct swconfig_callback *cb, void *arg)
+{
+	struct genl_info *info = cb->info;
+	int restart = 0;
+	int err;
+
+	do {
+		if (!cb->msg) {
+			cb->msg = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+			if (cb->msg == NULL)
+				goto error;
+		}
+
+		if (!(cb->fill(cb, arg) < 0))
+			break;
+
+		/* fill failed, check if this was already the second attempt */
+		if (restart)
+			goto error;
+
+		/* try again in a new message, send the current one */
+		restart = 1;
+		if (cb->close) {
+			if (cb->close(cb, arg) < 0)
+				goto error;
+		}
+		err = genlmsg_reply(cb->msg, info);
+		cb->msg = NULL;
+		if (err < 0)
+			goto error;
+
+	} while (restart);
+
+	return 0;
+
+error:
+	if (cb->msg)
+		nlmsg_free(cb->msg);
+	return -1;
+}
+
+static int
+swconfig_list_attrs(struct sk_buff *skb, struct genl_info *info)
+{
+	struct genlmsghdr *hdr = nlmsg_data(info->nlhdr);
+	const struct switch_attrlist *alist;
+	struct switch_dev *dev;
+	struct swconfig_callback cb;
+	int err = -EINVAL;
+	int i;
+
+	/* defaults */
+	struct switch_attr *def_list;
+	unsigned long *def_active;
+	int n_def;
+
+	dev = swconfig_get_dev(info);
+	if (!dev)
+		return -EINVAL;
+
+	switch (hdr->cmd) {
+	case SWITCH_CMD_LIST_GLOBAL:
+		alist = &dev->ops->attr_global;
+		def_list = default_global;
+		def_active = &dev->def_global;
+		n_def = ARRAY_SIZE(default_global);
+		break;
+	case SWITCH_CMD_LIST_VLAN:
+		alist = &dev->ops->attr_vlan;
+		def_list = default_vlan;
+		def_active = &dev->def_vlan;
+		n_def = ARRAY_SIZE(default_vlan);
+		break;
+	case SWITCH_CMD_LIST_PORT:
+		alist = &dev->ops->attr_port;
+		def_list = default_port;
+		def_active = &dev->def_port;
+		n_def = ARRAY_SIZE(default_port);
+		break;
+	default:
+		WARN_ON(1);
+		goto out;
+	}
+
+	memset(&cb, 0, sizeof(cb));
+	cb.info = info;
+	cb.fill = swconfig_dump_attr;
+	for (i = 0; i < alist->n_attr; i++) {
+		if (alist->attr[i].disabled)
+			continue;
+		cb.args[0] = i;
+		err = swconfig_send_multipart(&cb, (void *) &alist->attr[i]);
+		if (err < 0)
+			goto error;
+	}
+
+	/* defaults */
+	for (i = 0; i < n_def; i++) {
+		if (!test_bit(i, def_active))
+			continue;
+		cb.args[0] = SWITCH_ATTR_DEFAULTS_OFFSET + i;
+		err = swconfig_send_multipart(&cb, (void *) &def_list[i]);
+		if (err < 0)
+			goto error;
+	}
+	swconfig_put_dev(dev);
+
+	if (!cb.msg)
+		return 0;
+
+	return genlmsg_reply(cb.msg, info);
+
+error:
+	if (cb.msg)
+		nlmsg_free(cb.msg);
+out:
+	swconfig_put_dev(dev);
+	return err;
+}
+
+static const struct switch_attr *
+swconfig_lookup_attr(struct switch_dev *dev, struct genl_info *info,
+		struct switch_val *val)
+{
+	struct genlmsghdr *hdr = nlmsg_data(info->nlhdr);
+	const struct switch_attrlist *alist;
+	const struct switch_attr *attr = NULL;
+	int attr_id;
+
+	/* defaults */
+	struct switch_attr *def_list;
+	unsigned long *def_active;
+	int n_def;
+
+	if (!info->attrs[SWITCH_ATTR_OP_ID])
+		goto done;
+
+	switch (hdr->cmd) {
+	case SWITCH_CMD_SET_GLOBAL:
+	case SWITCH_CMD_GET_GLOBAL:
+		alist = &dev->ops->attr_global;
+		def_list = default_global;
+		def_active = &dev->def_global;
+		n_def = ARRAY_SIZE(default_global);
+		break;
+	case SWITCH_CMD_SET_VLAN:
+	case SWITCH_CMD_GET_VLAN:
+		alist = &dev->ops->attr_vlan;
+		def_list = default_vlan;
+		def_active = &dev->def_vlan;
+		n_def = ARRAY_SIZE(default_vlan);
+		if (!info->attrs[SWITCH_ATTR_OP_VLAN])
+			goto done;
+		val->port_vlan = nla_get_u32(info->attrs[SWITCH_ATTR_OP_VLAN]);
+		if (val->port_vlan >= dev->vlans)
+			goto done;
+		break;
+	case SWITCH_CMD_SET_PORT:
+	case SWITCH_CMD_GET_PORT:
+		alist = &dev->ops->attr_port;
+		def_list = default_port;
+		def_active = &dev->def_port;
+		n_def = ARRAY_SIZE(default_port);
+		if (!info->attrs[SWITCH_ATTR_OP_PORT])
+			goto done;
+		val->port_vlan = nla_get_u32(info->attrs[SWITCH_ATTR_OP_PORT]);
+		if (val->port_vlan >= dev->ports)
+			goto done;
+		break;
+	default:
+		WARN_ON(1);
+		goto done;
+	}
+
+	if (!alist)
+		goto done;
+
+	attr_id = nla_get_u32(info->attrs[SWITCH_ATTR_OP_ID]);
+	if (attr_id >= SWITCH_ATTR_DEFAULTS_OFFSET) {
+		attr_id -= SWITCH_ATTR_DEFAULTS_OFFSET;
+		if (attr_id >= n_def)
+			goto done;
+		if (!test_bit(attr_id, def_active))
+			goto done;
+		attr = &def_list[attr_id];
+	} else {
+		if (attr_id >= alist->n_attr)
+			goto done;
+		attr = &alist->attr[attr_id];
+	}
+
+	if (attr->disabled)
+		attr = NULL;
+
+done:
+	if (!attr)
+		pr_debug("attribute lookup failed\n");
+	val->attr = attr;
+	return attr;
+}
+
+static int
+swconfig_parse_ports(struct sk_buff *msg, struct nlattr *head,
+		struct switch_val *val, int max)
+{
+	struct nlattr *nla;
+	int rem;
+
+	val->len = 0;
+	nla_for_each_nested(nla, head, rem) {
+		struct nlattr *tb[SWITCH_PORT_ATTR_MAX+1];
+		struct switch_port *port = &val->value.ports[val->len];
+
+		if (val->len >= max)
+			return -EINVAL;
+
+		if (nla_parse_nested(tb, SWITCH_PORT_ATTR_MAX, nla,
+				port_policy))
+			return -EINVAL;
+
+		if (!tb[SWITCH_PORT_ID])
+			return -EINVAL;
+
+		port->id = nla_get_u32(tb[SWITCH_PORT_ID]);
+		if (tb[SWITCH_PORT_FLAG_TAGGED])
+			port->flags |= (1 << SWITCH_PORT_FLAG_TAGGED);
+		val->len++;
+	}
+
+	return 0;
+}
+
+static int
+swconfig_set_attr(struct sk_buff *skb, struct genl_info *info)
+{
+	const struct switch_attr *attr;
+	struct switch_dev *dev;
+	struct switch_val val;
+	int err = -EINVAL;
+
+	dev = swconfig_get_dev(info);
+	if (!dev)
+		return -EINVAL;
+
+	memset(&val, 0, sizeof(val));
+	attr = swconfig_lookup_attr(dev, info, &val);
+	if (!attr || !attr->set)
+		goto error;
+
+	val.attr = attr;
+	switch (attr->type) {
+	case SWITCH_TYPE_NOVAL:
+		break;
+	case SWITCH_TYPE_INT:
+		if (!info->attrs[SWITCH_ATTR_OP_VALUE_INT])
+			goto error;
+		val.value.i =
+			nla_get_u32(info->attrs[SWITCH_ATTR_OP_VALUE_INT]);
+		break;
+	case SWITCH_TYPE_STRING:
+		if (!info->attrs[SWITCH_ATTR_OP_VALUE_STR])
+			goto error;
+		val.value.s =
+			nla_data(info->attrs[SWITCH_ATTR_OP_VALUE_STR]);
+		break;
+	case SWITCH_TYPE_PORTS:
+		val.value.ports = dev->portbuf;
+		memset(dev->portbuf, 0,
+			sizeof(struct switch_port) * dev->ports);
+
+		/* TODO: implement multipart? */
+		if (info->attrs[SWITCH_ATTR_OP_VALUE_PORTS]) {
+			err = swconfig_parse_ports(skb,
+				info->attrs[SWITCH_ATTR_OP_VALUE_PORTS],
+				&val, dev->ports);
+			if (err < 0)
+				goto error;
+		} else {
+			val.len = 0;
+			err = 0;
+		}
+		break;
+	default:
+		goto error;
+	}
+
+	err = attr->set(dev, attr, &val);
+error:
+	swconfig_put_dev(dev);
+	return err;
+}
+
+static int
+swconfig_close_portlist(struct swconfig_callback *cb, void *arg)
+{
+	if (cb->nest[0])
+		nla_nest_end(cb->msg, cb->nest[0]);
+	return 0;
+}
+
+static int
+swconfig_send_port(struct swconfig_callback *cb, void *arg)
+{
+	const struct switch_port *port = arg;
+	struct nlattr *p = NULL;
+
+	if (!cb->nest[0]) {
+		cb->nest[0] = nla_nest_start(cb->msg, cb->cmd);
+		if (!cb->nest[0])
+			return -1;
+	}
+
+	p = nla_nest_start(cb->msg, SWITCH_ATTR_PORT);
+	if (!p)
+		goto error;
+
+	if (nla_put_u32(cb->msg, SWITCH_PORT_ID, port->id))
+		goto nla_put_failure;
+	if (port->flags & (1 << SWITCH_PORT_FLAG_TAGGED)) {
+		if (nla_put_flag(cb->msg, SWITCH_PORT_FLAG_TAGGED))
+			goto nla_put_failure;
+	}
+
+	nla_nest_end(cb->msg, p);
+	return 0;
+
+nla_put_failure:
+		nla_nest_cancel(cb->msg, p);
+error:
+	nla_nest_cancel(cb->msg, cb->nest[0]);
+	return -1;
+}
+
+static int
+swconfig_send_ports(struct sk_buff **msg, struct genl_info *info, int attr,
+		const struct switch_val *val)
+{
+	struct swconfig_callback cb;
+	int err = 0;
+	int i;
+
+	if (!val->value.ports)
+		return -EINVAL;
+
+	memset(&cb, 0, sizeof(cb));
+	cb.cmd = attr;
+	cb.msg = *msg;
+	cb.info = info;
+	cb.fill = swconfig_send_port;
+	cb.close = swconfig_close_portlist;
+
+	cb.nest[0] = nla_nest_start(cb.msg, cb.cmd);
+	for (i = 0; i < val->len; i++) {
+		err = swconfig_send_multipart(&cb, &val->value.ports[i]);
+		if (err)
+			goto done;
+	}
+	err = val->len;
+	swconfig_close_portlist(&cb, NULL);
+	*msg = cb.msg;
+
+done:
+	return err;
+}
+
+static int
+swconfig_get_attr(struct sk_buff *skb, struct genl_info *info)
+{
+	struct genlmsghdr *hdr = nlmsg_data(info->nlhdr);
+	const struct switch_attr *attr;
+	struct switch_dev *dev;
+	struct sk_buff *msg = NULL;
+	struct switch_val val;
+	int err = -EINVAL;
+	int cmd = hdr->cmd;
+
+	dev = swconfig_get_dev(info);
+	if (!dev)
+		return -EINVAL;
+
+	memset(&val, 0, sizeof(val));
+	attr = swconfig_lookup_attr(dev, info, &val);
+	if (!attr || !attr->get)
+		goto error;
+
+	if (attr->type == SWITCH_TYPE_PORTS) {
+		val.value.ports = dev->portbuf;
+		memset(dev->portbuf, 0,
+			sizeof(struct switch_port) * dev->ports);
+	}
+
+	err = attr->get(dev, attr, &val);
+	if (err)
+		goto error;
+
+	msg = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+	if (!msg)
+		goto error;
+
+	hdr = genlmsg_put(msg, info->snd_portid, info->snd_seq, &switch_fam,
+			0, cmd);
+	if (IS_ERR(hdr))
+		goto nla_put_failure;
+
+	switch (attr->type) {
+	case SWITCH_TYPE_INT:
+		if (nla_put_u32(msg, SWITCH_ATTR_OP_VALUE_INT, val.value.i))
+			goto nla_put_failure;
+		break;
+	case SWITCH_TYPE_STRING:
+		if (nla_put_string(msg, SWITCH_ATTR_OP_VALUE_STR, val.value.s))
+			goto nla_put_failure;
+		break;
+	case SWITCH_TYPE_PORTS:
+		err = swconfig_send_ports(&msg, info,
+				SWITCH_ATTR_OP_VALUE_PORTS, &val);
+		if (err < 0)
+			goto nla_put_failure;
+		break;
+	default:
+		pr_debug("invalid type in attribute\n");
+		err = -EINVAL;
+		goto error;
+	}
+	err = genlmsg_end(msg, hdr);
+	if (err < 0)
+		goto nla_put_failure;
+
+	swconfig_put_dev(dev);
+	return genlmsg_reply(msg, info);
+
+nla_put_failure:
+	if (msg)
+		nlmsg_free(msg);
+error:
+	swconfig_put_dev(dev);
+	if (!err)
+		err = -ENOMEM;
+	return err;
+}
+
+static int
+swconfig_send_switch(struct sk_buff *msg, u32 pid, u32 seq, int flags,
+		const struct switch_dev *dev)
+{
+	struct nlattr *p = NULL, *m = NULL;
+	void *hdr;
+	int i;
+
+	hdr = genlmsg_put(msg, pid, seq, &switch_fam, flags,
+			SWITCH_CMD_NEW_ATTR);
+	if (IS_ERR(hdr))
+		return -1;
+
+	if (nla_put_u32(msg, SWITCH_ATTR_ID, dev->id))
+		goto nla_put_failure;
+	if (nla_put_string(msg, SWITCH_ATTR_DEV_NAME, dev->devname))
+		goto nla_put_failure;
+	if (nla_put_string(msg, SWITCH_ATTR_ALIAS, dev->alias))
+		goto nla_put_failure;
+	if (nla_put_string(msg, SWITCH_ATTR_NAME, dev->name))
+		goto nla_put_failure;
+	if (nla_put_u32(msg, SWITCH_ATTR_VLANS, dev->vlans))
+		goto nla_put_failure;
+	if (nla_put_u32(msg, SWITCH_ATTR_PORTS, dev->ports))
+		goto nla_put_failure;
+	if (nla_put_u32(msg, SWITCH_ATTR_CPU_PORT, dev->cpu_port))
+		goto nla_put_failure;
+
+	m = nla_nest_start(msg, SWITCH_ATTR_PORTMAP);
+	if (!m)
+		goto nla_put_failure;
+	for (i = 0; i < dev->ports; i++) {
+		p = nla_nest_start(msg, SWITCH_ATTR_PORTS);
+		if (!p)
+			continue;
+		if (dev->portmap[i].s) {
+			if (nla_put_string(msg, SWITCH_PORTMAP_SEGMENT,
+						dev->portmap[i].s))
+				goto nla_put_failure;
+			if (nla_put_u32(msg, SWITCH_PORTMAP_VIRT,
+						dev->portmap[i].virt))
+				goto nla_put_failure;
+		}
+		nla_nest_end(msg, p);
+	}
+	nla_nest_end(msg, m);
+	return genlmsg_end(msg, hdr);
+nla_put_failure:
+	genlmsg_cancel(msg, hdr);
+	return -EMSGSIZE;
+}
+
+static int swconfig_dump_switches(struct sk_buff *skb,
+		struct netlink_callback *cb)
+{
+	struct switch_dev *dev;
+	int start = cb->args[0];
+	int idx = 0;
+
+	swconfig_lock();
+	list_for_each_entry(dev, &swdevs, dev_list) {
+		if (++idx <= start)
+			continue;
+		if (swconfig_send_switch(skb, NETLINK_CB(cb->skb).portid,
+				cb->nlh->nlmsg_seq, NLM_F_MULTI,
+				dev) < 0)
+			break;
+	}
+	swconfig_unlock();
+	cb->args[0] = idx;
+
+	return skb->len;
+}
+
+static int
+swconfig_done(struct netlink_callback *cb)
+{
+	return 0;
+}
+
+static struct genl_ops swconfig_ops[] = {
+	{
+		.cmd = SWITCH_CMD_LIST_GLOBAL,
+		.doit = swconfig_list_attrs,
+		.policy = switch_policy,
+	},
+	{
+		.cmd = SWITCH_CMD_LIST_VLAN,
+		.doit = swconfig_list_attrs,
+		.policy = switch_policy,
+	},
+	{
+		.cmd = SWITCH_CMD_LIST_PORT,
+		.doit = swconfig_list_attrs,
+		.policy = switch_policy,
+	},
+	{
+		.cmd = SWITCH_CMD_GET_GLOBAL,
+		.doit = swconfig_get_attr,
+		.policy = switch_policy,
+	},
+	{
+		.cmd = SWITCH_CMD_GET_VLAN,
+		.doit = swconfig_get_attr,
+		.policy = switch_policy,
+	},
+	{
+		.cmd = SWITCH_CMD_GET_PORT,
+		.doit = swconfig_get_attr,
+		.policy = switch_policy,
+	},
+	{
+		.cmd = SWITCH_CMD_SET_GLOBAL,
+		.doit = swconfig_set_attr,
+		.policy = switch_policy,
+	},
+	{
+		.cmd = SWITCH_CMD_SET_VLAN,
+		.doit = swconfig_set_attr,
+		.policy = switch_policy,
+	},
+	{
+		.cmd = SWITCH_CMD_SET_PORT,
+		.doit = swconfig_set_attr,
+		.policy = switch_policy,
+	},
+	{
+		.cmd = SWITCH_CMD_GET_SWITCH,
+		.dumpit = swconfig_dump_switches,
+		.policy = switch_policy,
+		.done = swconfig_done,
+	}
+};
+
+int
+register_switch(struct switch_dev *dev, struct net_device *netdev)
+{
+	struct switch_dev *sdev;
+	const int max_switches = 8 * sizeof(unsigned long);
+	unsigned long in_use = 0;
+	int i;
+
+	INIT_LIST_HEAD(&dev->dev_list);
+	if (netdev) {
+		dev->netdev = netdev;
+		if (!dev->alias)
+			dev->alias = netdev->name;
+	}
+	BUG_ON(!dev->alias);
+
+	if (dev->ports > 0) {
+		dev->portbuf = kzalloc(sizeof(struct switch_port) *
+				dev->ports, GFP_KERNEL);
+		if (!dev->portbuf)
+			return -ENOMEM;
+		dev->portmap = kzalloc(sizeof(struct switch_portmap) *
+				dev->ports, GFP_KERNEL);
+		if (!dev->portmap) {
+			kfree(dev->portbuf);
+			return -ENOMEM;
+		}
+	}
+	swconfig_defaults_init(dev);
+	mutex_init(&dev->sw_mutex);
+	swconfig_lock();
+	dev->id = ++swdev_id;
+
+	list_for_each_entry(sdev, &swdevs, dev_list) {
+		if (!sscanf(sdev->devname, SWCONFIG_DEVNAME, &i))
+			continue;
+		if (i < 0 || i > max_switches)
+			continue;
+
+		set_bit(i, &in_use);
+	}
+	i = find_first_zero_bit(&in_use, max_switches);
+
+	if (i == max_switches) {
+		swconfig_unlock();
+		return -ENFILE;
+	}
+
+	/* fill device name */
+	snprintf(dev->devname, IFNAMSIZ, SWCONFIG_DEVNAME, i);
+
+	list_add(&dev->dev_list, &swdevs);
+	swconfig_unlock();
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(register_switch);
+
+void
+unregister_switch(struct switch_dev *dev)
+{
+	kfree(dev->portbuf);
+	mutex_lock(&dev->sw_mutex);
+	swconfig_lock();
+	list_del(&dev->dev_list);
+	swconfig_unlock();
+	mutex_unlock(&dev->sw_mutex);
+}
+EXPORT_SYMBOL_GPL(unregister_switch);
+
+
+static int __init
+swconfig_init(void)
+{
+	int i, err;
+
+	INIT_LIST_HEAD(&swdevs);
+	err = genl_register_family(&switch_fam);
+	if (err)
+		return err;
+
+	for (i = 0; i < ARRAY_SIZE(swconfig_ops); i++) {
+		err = genl_register_ops(&switch_fam, &swconfig_ops[i]);
+		if (err)
+			goto unregister;
+	}
+
+	return 0;
+
+unregister:
+	genl_unregister_family(&switch_fam);
+	return err;
+}
+
+static void __exit
+swconfig_exit(void)
+{
+	genl_unregister_family(&switch_fam);
+}
+
+module_init(swconfig_init);
+module_exit(swconfig_exit);
+
diff --git a/include/linux/swconfig.h b/include/linux/swconfig.h
new file mode 100644
index 0000000..fd96eec
--- /dev/null
+++ b/include/linux/swconfig.h
@@ -0,0 +1,180 @@ 
+/*
+ * Switch configuration API
+ *
+ * Copyright (C) 2008 Felix Fietkau <nbd@openwrt.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+#ifndef _LINUX_SWITCH_H
+#define _LINUX_SWITCH_H
+
+#include <net/genetlink.h>
+#include <uapi/linux/swconfig.h>
+
+struct switch_dev;
+struct switch_op;
+struct switch_val;
+struct switch_attr;
+struct switch_attrlist;
+struct switch_led_trigger;
+
+int register_switch(struct switch_dev *dev, struct net_device *netdev);
+void unregister_switch(struct switch_dev *dev);
+
+/**
+ * struct switch_attrlist - attribute list
+ *
+ * @n_attr: number of attributes
+ * @attr: pointer to the attributes array
+ */
+struct switch_attrlist {
+	int n_attr;
+	const struct switch_attr *attr;
+};
+
+enum switch_port_speed {
+	SWITCH_PORT_SPEED_UNKNOWN = 0,
+	SWITCH_PORT_SPEED_10 = 10,
+	SWITCH_PORT_SPEED_100 = 100,
+	SWITCH_PORT_SPEED_1000 = 1000,
+};
+
+struct switch_port_link {
+	bool link;
+	bool duplex;
+	bool aneg;
+	bool tx_flow;
+	bool rx_flow;
+	enum switch_port_speed speed;
+};
+
+struct switch_port_stats {
+	unsigned long tx_bytes;
+	unsigned long rx_bytes;
+};
+
+/**
+ * struct switch_dev_ops - switch driver operations
+ *
+ * @attr_global: global switch attribute list
+ * @attr_port: port attribute list
+ * @attr_vlan: vlan attribute list
+ *
+ * Callbacks:
+ *
+ * @get_vlan_ports: read the port list of a VLAN
+ * @set_vlan_ports: set the port list of a VLAN
+ *
+ * @get_port_pvid: get the primary VLAN ID of a port
+ * @set_port_pvid: set the primary VLAN ID of a port
+ *
+ * @apply_config: apply all changed settings to the switch
+ * @reset_switch: resetting the switch
+ *
+ * @get_port_link: read the port link status
+ * @get_port_stats: read the port statistics counters
+ */
+struct switch_dev_ops {
+	struct switch_attrlist attr_global, attr_port, attr_vlan;
+
+	int (*get_vlan_ports)(struct switch_dev *dev, struct switch_val *val);
+	int (*set_vlan_ports)(struct switch_dev *dev, struct switch_val *val);
+
+	int (*get_port_pvid)(struct switch_dev *dev, int port, int *val);
+	int (*set_port_pvid)(struct switch_dev *dev, int port, int val);
+
+	int (*apply_config)(struct switch_dev *dev);
+	int (*reset_switch)(struct switch_dev *dev);
+
+	int (*get_port_link)(struct switch_dev *dev, int port,
+			     struct switch_port_link *link);
+	int (*get_port_stats)(struct switch_dev *dev, int port,
+			      struct switch_port_stats *stats);
+};
+
+/**
+ * struct switch_dev - switch device
+ *
+ * @ops: switch driver operations pointer
+ * @devname: switch device name (automatically filled)
+ * @name: switch driver name returned to user-space
+ * @alias: alias name for the switch (instead of ethX) returned to user-space
+ * @netdev: network device pointer if alias is not used
+ *
+ * @ports: number of physical switch ports
+ * @vlans: number of supported VLANs
+ * @cpu_port: identifier for the CPU port
+ */
+struct switch_dev {
+	const struct switch_dev_ops *ops;
+	/* will be automatically filled */
+	char devname[IFNAMSIZ];
+
+	const char *name;
+	/* NB: either alias or netdev must be set */
+	const char *alias;
+	struct net_device *netdev;
+
+	int ports;
+	int vlans;
+	int cpu_port;
+
+	/* the following fields are internal for swconfig */
+	int id;
+	struct list_head dev_list;
+	unsigned long def_global, def_port, def_vlan;
+
+	struct mutex sw_mutex;
+	struct switch_port *portbuf;
+	struct switch_portmap *portmap;
+
+	char buf[128];
+};
+
+struct switch_port {
+	u32 id;
+	u32 flags;
+};
+
+struct switch_portmap {
+	u32 virt;
+	const char *s;
+};
+
+struct switch_val {
+	const struct switch_attr *attr;
+	int port_vlan;
+	int len;
+	union {
+		const char *s;
+		u32 i;
+		struct switch_port *ports;
+	} value;
+};
+
+struct switch_attr {
+	int disabled;
+	int type;
+	const char *name;
+	const char *description;
+
+	int (*set)(struct switch_dev *dev, const struct switch_attr *attr,
+			struct switch_val *val);
+	int (*get)(struct switch_dev *dev, const struct switch_attr *attr,
+			struct switch_val *val);
+
+	/* for driver internal use */
+	int id;
+	int ofs;
+	int max;
+};
+
+#endif /* _LINUX_SWITCH_H */
diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild
index 115add2..0a995be 100644
--- a/include/uapi/linux/Kbuild
+++ b/include/uapi/linux/Kbuild
@@ -363,6 +363,7 @@  header-y += stddef.h
 header-y += string.h
 header-y += suspend_ioctls.h
 header-y += swab.h
+header-y += swconfig.h
 header-y += synclink.h
 header-y += sysctl.h
 header-y += sysinfo.h
diff --git a/include/uapi/linux/swconfig.h b/include/uapi/linux/swconfig.h
new file mode 100644
index 0000000..17cf178
--- /dev/null
+++ b/include/uapi/linux/swconfig.h
@@ -0,0 +1,103 @@ 
+/*
+ * Switch configuration API
+ *
+ * Copyright (C) 2008 Felix Fietkau <nbd@openwrt.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef _UAPI_LINUX_SWITCH_H
+#define _UAPI_LINUX_SWITCH_H
+
+#include <linux/types.h>
+#include <linux/netdevice.h>
+#include <linux/netlink.h>
+#include <linux/genetlink.h>
+#ifndef __KERNEL__
+#include <netlink/netlink.h>
+#include <netlink/genl/genl.h>
+#include <netlink/genl/ctrl.h>
+#endif
+
+/* main attributes */
+enum {
+	SWITCH_ATTR_UNSPEC,
+	/* global */
+	SWITCH_ATTR_TYPE,
+	/* device */
+	SWITCH_ATTR_ID,
+	SWITCH_ATTR_DEV_NAME,
+	SWITCH_ATTR_ALIAS,
+	SWITCH_ATTR_NAME,
+	SWITCH_ATTR_VLANS,
+	SWITCH_ATTR_PORTS,
+	SWITCH_ATTR_PORTMAP,
+	SWITCH_ATTR_CPU_PORT,
+	/* attributes */
+	SWITCH_ATTR_OP_ID,
+	SWITCH_ATTR_OP_TYPE,
+	SWITCH_ATTR_OP_NAME,
+	SWITCH_ATTR_OP_PORT,
+	SWITCH_ATTR_OP_VLAN,
+	SWITCH_ATTR_OP_VALUE_INT,
+	SWITCH_ATTR_OP_VALUE_STR,
+	SWITCH_ATTR_OP_VALUE_PORTS,
+	SWITCH_ATTR_OP_DESCRIPTION,
+	/* port lists */
+	SWITCH_ATTR_PORT,
+	SWITCH_ATTR_MAX
+};
+
+enum {
+	/* port map */
+	SWITCH_PORTMAP_PORTS,
+	SWITCH_PORTMAP_SEGMENT,
+	SWITCH_PORTMAP_VIRT,
+	SWITCH_PORTMAP_MAX
+};
+
+/* commands */
+enum {
+	SWITCH_CMD_UNSPEC,
+	SWITCH_CMD_GET_SWITCH,
+	SWITCH_CMD_NEW_ATTR,
+	SWITCH_CMD_LIST_GLOBAL,
+	SWITCH_CMD_GET_GLOBAL,
+	SWITCH_CMD_SET_GLOBAL,
+	SWITCH_CMD_LIST_PORT,
+	SWITCH_CMD_GET_PORT,
+	SWITCH_CMD_SET_PORT,
+	SWITCH_CMD_LIST_VLAN,
+	SWITCH_CMD_GET_VLAN,
+	SWITCH_CMD_SET_VLAN
+};
+
+/* data types */
+enum switch_val_type {
+	SWITCH_TYPE_UNSPEC,
+	SWITCH_TYPE_INT,
+	SWITCH_TYPE_STRING,
+	SWITCH_TYPE_PORTS,
+	SWITCH_TYPE_NOVAL,
+};
+
+/* port nested attributes */
+enum {
+	SWITCH_PORT_UNSPEC,
+	SWITCH_PORT_ID,
+	SWITCH_PORT_FLAG_TAGGED,
+	SWITCH_PORT_ATTR_MAX
+};
+
+#define SWITCH_ATTR_DEFAULTS_OFFSET	0x1000
+
+
+#endif /* _UAPI_LINUX_SWITCH_H */