diff mbox

[net-next-2.6,1/2] Add ndo_set_vf_port_profile (was iovnl)

Message ID 20100424003540.12745.81403.stgit@savbu-pc100.cisco.com
State Changes Requested, archived
Delegated to: David Miller
Headers show

Commit Message

Scott Feldman April 24, 2010, 12:35 a.m. UTC
From: Scott Feldman <scofeldm@cisco.com>

(This is take #2 on the iovnl patches posted earlier based on feedback from
Chris Wright, Arnd Bergmann, and others.  Thanks guys!)

Add new netdev ops ndo_set_vf_port_profile to allow setting of port-profile
on VF, along the lines of existing nds_set_vf_* ops.  Extends RTM_SETLINK
with new sub cmd called IFLA_VF_PORT_PROFILE (added to end on cmd list).  The
port-profile cmd arguments are (as seen from iproute2 cmdline):

       ip link set DEVICE [ { up | down } ]
                          ...
                          [ vf NUM [ mac LLADDR ]
                                   [ vlan VLANID [ qos VLAN-QOS ] ]
                                   [ rate TXRATE ] ] 
                                   [ port_profile [ PORT-PROFILE
                                           [ mac LLADDR ]
                                           [ host_uuid HOST_UUID ]
                                           [ client_uuid CLIENT_UUID ]
                                           [ client_name CLIENT_NAME ] ] ] ]


I took some liberties and s/SR-IOV/IOV in the code comments around the
ndo_set_vf_* cmds as they can apply to both SR-IOV and non-SR-IOV adapters,
as long as there is a PF:VF parent:child relationship.

A port-profile is used to configure/enable the network port backing the VF, not
to configure the host-facing side of the VF.

Signed-off-by: Scott Feldman <scofeldm@cisco.com>
Signed-off-by: Roopa Prabhu<roprabhu@cisco.com>
---
 include/linux/if_link.h   |   15 +++++++++++++--
 include/linux/netdevice.h |   11 ++++++++++-
 net/core/rtnetlink.c      |   20 ++++++++++++++++++++
 3 files changed, 43 insertions(+), 3 deletions(-)


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Chris Wright April 24, 2010, 2:22 a.m. UTC | #1
* Scott Feldman (scofeldm@cisco.com) wrote:
> From: Scott Feldman <scofeldm@cisco.com>
> 
> (This is take #2 on the iovnl patches posted earlier based on feedback from
> Chris Wright, Arnd Bergmann, and others.  Thanks guys!)
> 
> Add new netdev ops ndo_set_vf_port_profile to allow setting of port-profile
> on VF, along the lines of existing nds_set_vf_* ops.  Extends RTM_SETLINK
> with new sub cmd called IFLA_VF_PORT_PROFILE (added to end on cmd list).  The
> port-profile cmd arguments are (as seen from iproute2 cmdline):
> 
>        ip link set DEVICE [ { up | down } ]
>                           ...
>                           [ vf NUM [ mac LLADDR ]
>                                    [ vlan VLANID [ qos VLAN-QOS ] ]
>                                    [ rate TXRATE ] ] 
>                                    [ port_profile [ PORT-PROFILE
>                                            [ mac LLADDR ]
>                                            [ host_uuid HOST_UUID ]
>                                            [ client_uuid CLIENT_UUID ]
>                                            [ client_name CLIENT_NAME ] ] ] ]
> 
> 
> I took some liberties and s/SR-IOV/IOV in the code comments around the
> ndo_set_vf_* cmds as they can apply to both SR-IOV and non-SR-IOV adapters,
> as long as there is a PF:VF parent:child relationship.

For enic case, which do you expect to use for net_dev and VF index?  Would
this be VF + index== 0 (meaning the degenerate case you described last
time where PF==VF)?

> A port-profile is used to configure/enable the network port backing the VF, not
> to configure the host-facing side of the VF.

How shall we do the lldpad case?

> Signed-off-by: Scott Feldman <scofeldm@cisco.com>
> Signed-off-by: Roopa Prabhu<roprabhu@cisco.com>
> ---
>  include/linux/if_link.h   |   15 +++++++++++++--
>  include/linux/netdevice.h |   11 ++++++++++-
>  net/core/rtnetlink.c      |   20 ++++++++++++++++++++
>  3 files changed, 43 insertions(+), 3 deletions(-)
> 
> diff --git a/include/linux/if_link.h b/include/linux/if_link.h
> index cfd420b..2c5cc65 100644
> --- a/include/linux/if_link.h
> +++ b/include/linux/if_link.h
> @@ -110,12 +110,13 @@ enum {
>  #define IFLA_LINKINFO IFLA_LINKINFO
>  	IFLA_NET_NS_PID,
>  	IFLA_IFALIAS,
> -	IFLA_NUM_VF,		/* Number of VFs if device is SR-IOV PF */
> +	IFLA_NUM_VF,		/* Number of VFs if device is IOV PF */
>  	IFLA_VF_MAC,		/* Hardware queue specific attributes */
>  	IFLA_VF_VLAN,
>  	IFLA_VF_TX_RATE,	/* TX Bandwidth Allocation */
>  	IFLA_VFINFO,
>  	IFLA_STATS64,
> +	IFLA_VF_PORT_PROFILE,
>  	__IFLA_MAX
>  };
>  
> @@ -234,7 +235,7 @@ enum macvlan_mode {
>  	MACVLAN_MODE_BRIDGE  = 4, /* talk to bridge ports directly */
>  };
>  
> -/* SR-IOV virtual function managment section */
> +/* IOV virtual function managment section */
>  
>  struct ifla_vf_mac {
>  	__u32 vf;
> @@ -259,4 +260,14 @@ struct ifla_vf_info {
>  	__u32 qos;
>  	__u32 tx_rate;
>  };
> +
> +struct ifla_vf_port_profile {
> +	__u32 vf;
> +	__u8 port_profile[64];
> +	__u8 mac[32];
> +	__u8 host_uuid[64]; /* e.g. "CEEFD3B1-9E11-11DE-BDFD-000BAB01C0FB" */
> +	__u8 client_uuid[64];
> +	__u8 client_name[64]; /* e.g. "vm0-eth1" */
> +};
> +
>  #endif /* _LINUX_IF_LINK_H */
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 3c5ed5f..26dd4cb 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -690,10 +690,13 @@ struct netdev_rx_queue {
>   *
>   * void (*ndo_poll_controller)(struct net_device *dev);
>   *
> - *	SR-IOV management functions.
> + *	IOV management functions.
>   * int (*ndo_set_vf_mac)(struct net_device *dev, int vf, u8* mac);
>   * int (*ndo_set_vf_vlan)(struct net_device *dev, int vf, u16 vlan, u8 qos);
>   * int (*ndo_set_vf_tx_rate)(struct net_device *dev, int vf, int rate);
> + * int (*ndo_set_vf_port_profile)(struct net_device *dev, int vf,
> + *				  u8 *port_profile, u8 *mac, u8 *host_uuid,
> + *				  u8 *client_uuid, u8 *client_name);
>   * int (*ndo_get_vf_config)(struct net_device *dev,
>   *			    int vf, struct ifla_vf_info *ivf);
>   */
> @@ -741,6 +744,12 @@ struct net_device_ops {
>  						   int queue, u16 vlan, u8 qos);
>  	int			(*ndo_set_vf_tx_rate)(struct net_device *dev,
>  						      int vf, int rate);
> +	int			(*ndo_set_vf_port_profile)(
> +					struct net_device *dev, int vf,
> +					u8 *port_profile, u8 *mac,
> +					u8 *host_uuid,
> +					u8 *client_uuid,
> +					u8 *client_name);
>  	int			(*ndo_get_vf_config)(struct net_device *dev,
>  						     int vf,
>  						     struct ifla_vf_info *ivf);
> diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
> index 78c8598..7268e8e 100644
> --- a/net/core/rtnetlink.c
> +++ b/net/core/rtnetlink.c
> @@ -824,6 +824,8 @@ const struct nla_policy ifla_policy[IFLA_MAX+1] = {
>  				    .len = sizeof(struct ifla_vf_vlan) },
>  	[IFLA_VF_TX_RATE]	= { .type = NLA_BINARY,
>  				    .len = sizeof(struct ifla_vf_tx_rate) },
> +	[IFLA_VF_PORT_PROFILE]	= { .type = NLA_BINARY,
> +				    .len = sizeof(struct ifla_vf_port_profile)},
>  };
>  EXPORT_SYMBOL(ifla_policy);
>  
> @@ -1028,6 +1030,24 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm,
>  	}
>  	err = 0;
>  
> +	if (tb[IFLA_VF_PORT_PROFILE]) {
> +		struct ifla_vf_port_profile *ivp;
> +		ivp = nla_data(tb[IFLA_VF_PORT_PROFILE]);
> +		err = -EOPNOTSUPP;
> +		if (ops->ndo_set_vf_port_profile)
> +			ivp->port_profile[sizeof(ivp->port_profile)-1] = 0;
> +			ivp->host_uuid[sizeof(ivp->host_uuid)-1] = 0;
> +			ivp->client_uuid[sizeof(ivp->client_uuid)-1] = 0;
> +			ivp->client_name[sizeof(ivp->client_name)-1] = 0;

Seems a little unusual to modify the buffer, add a kernel internal structure
that can be passed to ndo callback (where buffer lens can be knonw)?

> +			err = ops->ndo_set_vf_port_profile(dev, ivp->vf,
> +				ivp->port_profile, ivp->mac, ivp->host_uuid,
> +				ivp->client_uuid, ivp->client_name);
> +		if (err < 0)
> +			goto errout;
> +		modified = 1;
> +	}
> +	err = 0;
> +
>  errout:
>  	if (err < 0 && modified && net_ratelimit())
>  		printk(KERN_WARNING "A link change request failed with "
> 
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Miller April 24, 2010, 7:19 a.m. UTC | #2
From: Scott Feldman <scofeldm@cisco.com>
Date: Fri, 23 Apr 2010 17:35:41 -0700

> +
> +struct ifla_vf_port_profile {
> +	__u32 vf;
> +	__u8 port_profile[64];
> +	__u8 mac[32];
> +	__u8 host_uuid[64]; /* e.g. "CEEFD3B1-9E11-11DE-BDFD-000BAB01C0FB" */
> +	__u8 client_uuid[64];
> +	__u8 client_name[64]; /* e.g. "vm0-eth1" */
> +};
> +

Please define some macros to represent the string sizes.

;
>  	int			(*ndo_set_vf_tx_rate)(struct net_device *dev,
>  						      int vf, int rate);
> +	int			(*ndo_set_vf_port_profile)(
> +					struct net_device *dev, int vf,
> +					u8 *port_profile, u8 *mac,
> +					u8 *host_uuid,
> +					u8 *client_uuid,
> +					u8 *client_name);
>  	int			(*ndo_get_vf_config)(struct net_device *dev,

Just pass the "struct ifla_vf_port_profile *" instead of tons of
arguments.

Also, I think it's reasonable to fetch the current profile in use, so
GETLINK ought to report these things.  To make it generic we can
maintain the settings given to us in software, hung off of the netdev
struct, and simply report that during GETLINK.

In fact, in general, anything that can be "set" should always be
available from a "get".
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Scott Feldman April 24, 2010, 2:37 p.m. UTC | #3
On 4/23/10 7:22 PM, "Chris Wright" <chrisw@redhat.com> wrote:

>> I took some liberties and s/SR-IOV/IOV in the code comments around the
>> ndo_set_vf_* cmds as they can apply to both SR-IOV and non-SR-IOV adapters,
>> as long as there is a PF:VF parent:child relationship.
> 
> For enic case, which do you expect to use for net_dev and VF index?  Would
> this be VF + index== 0 (meaning the degenerate case you described last
> time where PF==VF)?

Yes, for this enic PF==VF, but that's a short term situation.  It's a small
matter of programming (in firmware) to turn enic into the more general case.
But I want to focus on getting port-profile support in first, with the
current enic+firmware.

>> A port-profile is used to configure/enable the network port backing the VF,
>> not
>> to configure the host-facing side of the VF.
> 
> How shall we do the lldpad case?

Same as before with iovnl.  The sender of RTM_SETLINK msg (say libvirt)
needs to send with mcast group RTMGRP_LINK and listener (say lldpad) needs
to listen on that mcast group.  This way, both kernel and user-space get the
msg.

>> + if (tb[IFLA_VF_PORT_PROFILE]) {
>> +  struct ifla_vf_port_profile *ivp;
>> +  ivp = nla_data(tb[IFLA_VF_PORT_PROFILE]);
>> +  err = -EOPNOTSUPP;
>> +  if (ops->ndo_set_vf_port_profile)
>> +   ivp->port_profile[sizeof(ivp->port_profile)-1] = 0;
>> +   ivp->host_uuid[sizeof(ivp->host_uuid)-1] = 0;
>> +   ivp->client_uuid[sizeof(ivp->client_uuid)-1] = 0;
>> +   ivp->client_name[sizeof(ivp->client_name)-1] = 0;
> 
> Seems a little unusual to modify the buffer, add a kernel internal structure
> that can be passed to ndo callback (where buffer lens can be knonw)?

Ok, let me see what can be done here.

-scott

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Scott Feldman April 26, 2010, 7:27 p.m. UTC | #4
On 4/24/10 12:19 AM, "David Miller" <davem@davemloft.net> wrote:

>> int   (*ndo_set_vf_tx_rate)(struct net_device *dev,
>>      int vf, int rate);
>> + int   (*ndo_set_vf_port_profile)(
>> +     struct net_device *dev, int vf,
>> +     u8 *port_profile, u8 *mac,
>> +     u8 *host_uuid,
>> +     u8 *client_uuid,
>> +     u8 *client_name);
>> int   (*ndo_get_vf_config)(struct net_device *dev,
> 
> Just pass the "struct ifla_vf_port_profile *" instead of tons of
> arguments.

Ok
 
> Also, I think it's reasonable to fetch the current profile in use, so
> GETLINK ought to report these things.  To make it generic we can
> maintain the settings given to us in software, hung off of the netdev
> struct, and simply report that during GETLINK.

We'd need an array of struct ifla_vf_port_profile hanging off of netdev, one
element for each VF.  That seems like a lot of mem hanging off of netdev,
and we'd have to define a MAX_VF to size the array.  How about a
ndo_get_vf_port_profile() that the netdev fills in, and the netdev keeps the
data in it's private area?  That's how ndo_get_vf_config() is working.
 
-scott

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Scott Feldman April 26, 2010, 7:57 p.m. UTC | #5
On 4/26/10 12:27 PM, "Scott Feldman" <scofeldm@cisco.com> wrote:
> On 4/24/10 12:19 AM, "David Miller" <davem@davemloft.net> wrote:
>> Also, I think it's reasonable to fetch the current profile in use, so
>> GETLINK ought to report these things.  To make it generic we can
>> maintain the settings given to us in software, hung off of the netdev
>> struct, and simply report that during GETLINK.
> 
> We'd need an array of struct ifla_vf_port_profile hanging off of netdev, one
> element for each VF.  That seems like a lot of mem hanging off of netdev,
> and we'd have to define a MAX_VF to size the array.  How about a
> ndo_get_vf_port_profile() that the netdev fills in, and the netdev keeps the
> data in it's private area?  That's how ndo_get_vf_config() is working.

Hmmm....even that isn't so nice because the port-profile info for all VFs is
going to get stuffed into the RTM_GETLINK skb, and I assume there are limits
on the skb return size.

Here's a proposal:

Currently we have RTM_GETLINK for

    ip link show [ DEVICE ]

This dumps everything for the DEVICE including info for each VF.  Let's
modify RTM_GETLINK to look like this:

    ip link show [ DEVICE [ vf NUM] ]

If you don't give the optional vf NUM you get base dump on DEVICE.  If you
give vf NUM, you get the VF-specific dump.  This scales much better with the
number of VFs.  

(Number of VFs can easily be > 128 in some designs).

Comments?

-scott

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Miller April 26, 2010, 8:24 p.m. UTC | #6
From: Scott Feldman <scofeldm@cisco.com>
Date: Mon, 26 Apr 2010 12:27:51 -0700

> We'd need an array of struct ifla_vf_port_profile hanging off of netdev, one
> element for each VF.  That seems like a lot of mem hanging off of netdev,
> and we'd have to define a MAX_VF to size the array.  How about a
> ndo_get_vf_port_profile() that the netdev fills in, and the netdev keeps the
> data in it's private area?  That's how ndo_get_vf_config() is working.

Sure if the device can do it, but for situations where it can't we can
use a linked list and dynamic memory allocation.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Miller April 26, 2010, 8:25 p.m. UTC | #7
From: Scott Feldman <scofeldm@cisco.com>
Date: Mon, 26 Apr 2010 12:57:06 -0700

> Hmmm....even that isn't so nice because the port-profile info for all VFs is
> going to get stuffed into the RTM_GETLINK skb, and I assume there are limits
> on the skb return size.

It's probably better to use .dump() for this, which allows to return
the result in multiple SKBs.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Rose, Gregory V April 26, 2010, 10:38 p.m. UTC | #8
>-----Original Message-----
>From: netdev-owner@vger.kernel.org [mailto:netdev-owner@vger.kernel.org]
>On Behalf Of Scott Feldman
>Sent: Monday, April 26, 2010 12:57 PM
>To: Scott Feldman; David Miller
>Cc: netdev@vger.kernel.org; chrisw@redhat.com; arnd@arndb.de
>Subject: Re: [net-next-2.6 PATCH 1/2] Add ndo_set_vf_port_profile
>
>On 4/26/10 12:27 PM, "Scott Feldman" <scofeldm@cisco.com> wrote:
>> We'd need an array of struct ifla_vf_port_profile hanging off of
>netdev, one
>> element for each VF.  That seems like a lot of mem hanging off of
>netdev,
>> and we'd have to define a MAX_VF to size the array.  How about a
>> ndo_get_vf_port_profile() that the netdev fills in, and the netdev
>keeps the
>> data in it's private area?  That's how ndo_get_vf_config() is working.
>
>Hmmm....even that isn't so nice because the port-profile info for all
>VFs is
>going to get stuffed into the RTM_GETLINK skb, and I assume there are
>limits
>on the skb return size.
>
>Here's a proposal:
>
>Currently we have RTM_GETLINK for
>
>    ip link show [ DEVICE ]
>
>This dumps everything for the DEVICE including info for each VF.  Let's
>modify RTM_GETLINK to look like this:
>
>    ip link show [ DEVICE [ vf NUM] ]
>
>If you don't give the optional vf NUM you get base dump on DEVICE.  If
>you
>give vf NUM, you get the VF-specific dump.  This scales much better with
>the
>number of VFs.
>
>(Number of VFs can easily be > 128 in some designs).
>
>Comments?

It seems to me that this:

ip link show [ DEVICE ]

should at least return the number of VFs so
that you can make sure the subsequent usage of this:

ip link show [ DEVICE [ vf NUM ] ]

is still in range with the [ vf NUM ] parameter.  Otherwise you wouldn't know
what the range of NUM is.

Other than that I can see why you would want to limit the size of the dump
when using 'ip link show [ DEVICE ]'.

- Greg

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Scott Feldman April 26, 2010, 11:21 p.m. UTC | #9
On 4/26/10 3:38 PM, "Rose, Gregory V" <gregory.v.rose@intel.com> wrote:

>> On 4/26/10 12:27 PM, "Scott Feldman" <scofeldm@cisco.com> wrote:
>> Here's a proposal:
>> 
>> Currently we have RTM_GETLINK for
>> 
>>    ip link show [ DEVICE ]
>> 
>> This dumps everything for the DEVICE including info for each VF.  Let's
>> modify RTM_GETLINK to look like this:
>> 
>>    ip link show [ DEVICE [ vf NUM] ]
>> 
>> If you don't give the optional vf NUM you get base dump on DEVICE.  If
>> you
>> give vf NUM, you get the VF-specific dump.  This scales much better with
>> the
>> number of VFs.
>> 
>> (Number of VFs can easily be > 128 in some designs).
>> 
>> Comments?
> 
> It seems to me that this:
> 
> ip link show [ DEVICE ]
> 
> should at least return the number of VFs so
> that you can make sure the subsequent usage of this:

Yes, I believe that's there today:

    NLA_PUT_U32(skb, IFLA_NUM_VF, dev_num_vf(dev->dev.parent));

The number of VFs is returned in RTM_GETLINK.  But, it's only returned if:

    if (dev->netdev_ops->ndo_get_vf_config && dev->dev.parent)

For my proposal, I'll need to return IFLA_NUM_VF unconditionally so callers
can get num VFs.
 
> ip link show [ DEVICE [ vf NUM ] ]
> 
> is still in range with the [ vf NUM ] parameter.  Otherwise you wouldn't know
> what the range of NUM is.
> 
> Other than that I can see why you would want to limit the size of the dump
> when using 'ip link show [ DEVICE ]'.
> 
> - Greg
> 

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Scott Feldman April 27, 2010, 12:03 a.m. UTC | #10
On 4/26/10 4:21 PM, "Scott Feldman" <scofeldm@cisco.com> wrote:

>> It seems to me that this:
>> 
>> ip link show [ DEVICE ]
>> 
>> should at least return the number of VFs so
>> that you can make sure the subsequent usage of this:
> 
> Yes, I believe that's there today:
> 
>     NLA_PUT_U32(skb, IFLA_NUM_VF, dev_num_vf(dev->dev.parent));
> 
> The number of VFs is returned in RTM_GETLINK.  But, it's only returned if:
> 
>     if (dev->netdev_ops->ndo_get_vf_config && dev->dev.parent)
> 
> For my proposal, I'll need to return IFLA_NUM_VF unconditionally so callers
> can get num VFs.

Hmmm...seems IFLA_NUM_VF assumes a PCI device supporting SR-IOV when it uses
dev_num_vf().  I think a better option would have been to query the device
for the number of VFs, without assuming SR-IOV or even PCI.

I see a ndo_get_num_vf() coming...

-scott

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Chris Wright April 27, 2010, 12:15 a.m. UTC | #11
* Scott Feldman (scofeldm@cisco.com) wrote:
> Hmmm...seems IFLA_NUM_VF assumes a PCI device supporting SR-IOV when it uses
> dev_num_vf().  I think a better option would have been to query the device
> for the number of VFs, without assuming SR-IOV or even PCI.
> 
> I see a ndo_get_num_vf() coming...

That was discussed when the patch originally merged.  The proposal to make
get/set symmetric.  Right now it's an unusual mix of sets + one
collective get.

thanks,
-chris
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Arnd Bergmann April 27, 2010, 12:35 p.m. UTC | #12
On Tuesday 27 April 2010, Scott Feldman wrote:
> > Yes, I believe that's there today:
> > 
> >     NLA_PUT_U32(skb, IFLA_NUM_VF, dev_num_vf(dev->dev.parent));
> > 
> > The number of VFs is returned in RTM_GETLINK.  But, it's only returned if:
> > 
> >     if (dev->netdev_ops->ndo_get_vf_config && dev->dev.parent)
> > 
> > For my proposal, I'll need to return IFLA_NUM_VF unconditionally so callers
> > can get num VFs.
> 
> Hmmm...seems IFLA_NUM_VF assumes a PCI device supporting SR-IOV when it uses
> dev_num_vf().  I think a better option would have been to query the device
> for the number of VFs, without assuming SR-IOV or even PCI.
> 
> I see a ndo_get_num_vf() coming...

Shouldn't the number of registered port profiles be totally independent of
the number of virtual functions?

Any of the VFs could multiplex multiple guests using macvlan, which means you
need to register each guest separately, not each VF.

Anything that ties port profiles to VFs seems fundamentally flawed AFAICT,
at least when we want to extend this to adapters that don't do it in firmware.

	Arnd
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Anirban Chakraborty April 27, 2010, 5:33 p.m. UTC | #13
On Apr 27, 2010, at 5:35 AM, Arnd Bergmann wrote:

> On Tuesday 27 April 2010, Scott Feldman wrote:
>>> Yes, I believe that's there today:
>>> 
>>>    NLA_PUT_U32(skb, IFLA_NUM_VF, dev_num_vf(dev->dev.parent));
>>> 
>>> The number of VFs is returned in RTM_GETLINK.  But, it's only returned if:
>>> 
>>>    if (dev->netdev_ops->ndo_get_vf_config && dev->dev.parent)
>>> 
>>> For my proposal, I'll need to return IFLA_NUM_VF unconditionally so callers
>>> can get num VFs.
>> 
>> Hmmm...seems IFLA_NUM_VF assumes a PCI device supporting SR-IOV when it uses
>> dev_num_vf().  I think a better option would have been to query the device
>> for the number of VFs, without assuming SR-IOV or even PCI.
>> 
>> I see a ndo_get_num_vf() coming...
> 
> Shouldn't the number of registered port profiles be totally independent of
> the number of virtual functions?
> 
> Any of the VFs could multiplex multiple guests using macvlan, which means you
> need to register each guest separately, not each VF.
> 
> Anything that ties port profiles to VFs seems fundamentally flawed AFAICT,
> at least when we want to extend this to adapters that don't do it in firmware.

Correct me if I am wrong. Shouldn't the port profile be tied to the physical NICs which are essentially
PCI functions (be it PF or VF)? I'd think that a port profile would have configuration settings for all the
physical NICs (PF/VF) of a specific physical port of the adapter. I liked the idea of querying the device
for number of VFs as it will cover both SR-IOV and non SR-IOV PCI functions.

thanks,
-Anirban--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Arnd Bergmann April 27, 2010, 7:38 p.m. UTC | #14
On Tuesday 27 April 2010 19:33:04 Anirban Chakraborty wrote:
> On Apr 27, 2010, at 5:35 AM, Arnd Bergmann wrote:
> > Anything that ties port profiles to VFs seems fundamentally flawed AFAICT,
> > at least when we want to extend this to adapters that don't do it in firmware.
> 
> Correct me if I am wrong. Shouldn't the port profile be tied to the physical NICs which are essentially
> PCI functions (be it PF or VF)? I'd think that a port profile would have configuration settings for all the
> physical NICs (PF/VF) of a specific physical port of the adapter. I liked the idea of querying the device
> for number of VFs as it will cover both SR-IOV and non SR-IOV PCI functions.

Yes, the port profile association is tied to whoever owns the link to the switch.
That can be a regular NIC, an SR-IOV PF, an ethernet bonding device or an S-component
implementing provider S-VLANs on top of any of these.

Usually it will be the same as a physical link, but in case of bonding it is two
physical links while in case of S-VLAN, you have multiple instances that each
have their own set of port profile association. If S-VLAN is implemented by
the NIC, that may be a VF.

Querying a PF for the number of VFs attached to it is a useful thing, but this
is independent of port profiles. Consider this (artificially complex) setup:

- eth0 is the PF of an SR-IOV NIC
- eth1 is a regular single-channel NIC
- vf0 is a VF of eth0, used by a guest using PCI passthrough mode on S-VLAN 2
- vf1 is a VF of eth0 owned by the host on S-VLAN 3
- vf1.23 is a VLAN port for VLAN 23 in S-VLAN 3
- br0 is a bridge connected to vf1
- br23 is a bridge VLAN device for br0
- vf2 is a VF of eth0 owned by the host on S-VLAN 4
- eth1.5 is a software vlan device for S-VLAN 4
- bond0 combines eth1.5 and vf2
- bond0.24 is a VLAN port for VLAN24 on bond0
- tap0 is a guest connected to br0 in trunk mode
- tap1 is a guest connected to br23 in access mode
- macvtap0 is a VEPA mode guest on bond0
- macvtap1 is a private mode guest on bond0.24

This means you have a total of five guests running, on vf0, tap0, tap1,
macvtap0 and macvtap1. Querying the number of VFs on eth0 will return '2',
for vf0 and vf1. What you are interested in however is which guests are
associated. Querying every single interface in the system will tell you

eth0: one guest (vf0)
vf1: two guests (tap0 and tap1)
bond0: two guests (macvtap0 and macvtap1)

	Arnd
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Scott Feldman April 27, 2010, 8:57 p.m. UTC | #15
On 4/27/10 5:35 AM, "Arnd Bergmann" <arnd@arndb.de> wrote:

> On Tuesday 27 April 2010, Scott Feldman wrote:
>>> Yes, I believe that's there today:
>>> 
>>>     NLA_PUT_U32(skb, IFLA_NUM_VF, dev_num_vf(dev->dev.parent));
>>> 
>>> The number of VFs is returned in RTM_GETLINK.  But, it's only returned if:
>>> 
>>>     if (dev->netdev_ops->ndo_get_vf_config && dev->dev.parent)
>>> 
>>> For my proposal, I'll need to return IFLA_NUM_VF unconditionally so callers
>>> can get num VFs.
>> 
>> Hmmm...seems IFLA_NUM_VF assumes a PCI device supporting SR-IOV when it uses
>> dev_num_vf().  I think a better option would have been to query the device
>> for the number of VFs, without assuming SR-IOV or even PCI.
>> 
>> I see a ndo_get_num_vf() coming...
> 
> Shouldn't the number of registered port profiles be totally independent of
> the number of virtual functions?
> 
> Any of the VFs could multiplex multiple guests using macvlan, which means you
> need to register each guest separately, not each VF.
> 
> Anything that ties port profiles to VFs seems fundamentally flawed AFAICT,
> at least when we want to extend this to adapters that don't do it in firmware.

Ya, I tend I agree.  Let's just make port-profile a setting of any netdev,
an eth, macvtap, eth.x, bond, etc.  That's probably what I should have done
in the first place.  Something like:

       ip link set DEVICE [ { up | down } ]
                          [ arp { on | off } ]
                            <...clip...>
                          [ alias NAME ]
                          [ vf NUM [ mac LLADDR ]
                                   [ vlan VLANID [ qos VLAN-QOS ] ]
                                   [ rate TXRATE ] ]
                          [ port_profile [ PORT-PROFILE
                                   [ mac LLADDR ]
                                   [ host_uuid HOST_UUID ]
                                   [ client_uuid CLIENT_UUID ]
                                   [ client_name CLIENT_NAME ] ] ] ]
       ip link show [ DEVICE ]

I think I was trying to be too accommodating for models with VFs, but it
doesn't matter like you point out.

This way, I can get the RTM_GETLINK to return the port-profile in use.

New patches coming soon...

-scott

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/include/linux/if_link.h b/include/linux/if_link.h
index cfd420b..2c5cc65 100644
--- a/include/linux/if_link.h
+++ b/include/linux/if_link.h
@@ -110,12 +110,13 @@  enum {
 #define IFLA_LINKINFO IFLA_LINKINFO
 	IFLA_NET_NS_PID,
 	IFLA_IFALIAS,
-	IFLA_NUM_VF,		/* Number of VFs if device is SR-IOV PF */
+	IFLA_NUM_VF,		/* Number of VFs if device is IOV PF */
 	IFLA_VF_MAC,		/* Hardware queue specific attributes */
 	IFLA_VF_VLAN,
 	IFLA_VF_TX_RATE,	/* TX Bandwidth Allocation */
 	IFLA_VFINFO,
 	IFLA_STATS64,
+	IFLA_VF_PORT_PROFILE,
 	__IFLA_MAX
 };
 
@@ -234,7 +235,7 @@  enum macvlan_mode {
 	MACVLAN_MODE_BRIDGE  = 4, /* talk to bridge ports directly */
 };
 
-/* SR-IOV virtual function managment section */
+/* IOV virtual function managment section */
 
 struct ifla_vf_mac {
 	__u32 vf;
@@ -259,4 +260,14 @@  struct ifla_vf_info {
 	__u32 qos;
 	__u32 tx_rate;
 };
+
+struct ifla_vf_port_profile {
+	__u32 vf;
+	__u8 port_profile[64];
+	__u8 mac[32];
+	__u8 host_uuid[64]; /* e.g. "CEEFD3B1-9E11-11DE-BDFD-000BAB01C0FB" */
+	__u8 client_uuid[64];
+	__u8 client_name[64]; /* e.g. "vm0-eth1" */
+};
+
 #endif /* _LINUX_IF_LINK_H */
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 3c5ed5f..26dd4cb 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -690,10 +690,13 @@  struct netdev_rx_queue {
  *
  * void (*ndo_poll_controller)(struct net_device *dev);
  *
- *	SR-IOV management functions.
+ *	IOV management functions.
  * int (*ndo_set_vf_mac)(struct net_device *dev, int vf, u8* mac);
  * int (*ndo_set_vf_vlan)(struct net_device *dev, int vf, u16 vlan, u8 qos);
  * int (*ndo_set_vf_tx_rate)(struct net_device *dev, int vf, int rate);
+ * int (*ndo_set_vf_port_profile)(struct net_device *dev, int vf,
+ *				  u8 *port_profile, u8 *mac, u8 *host_uuid,
+ *				  u8 *client_uuid, u8 *client_name);
  * int (*ndo_get_vf_config)(struct net_device *dev,
  *			    int vf, struct ifla_vf_info *ivf);
  */
@@ -741,6 +744,12 @@  struct net_device_ops {
 						   int queue, u16 vlan, u8 qos);
 	int			(*ndo_set_vf_tx_rate)(struct net_device *dev,
 						      int vf, int rate);
+	int			(*ndo_set_vf_port_profile)(
+					struct net_device *dev, int vf,
+					u8 *port_profile, u8 *mac,
+					u8 *host_uuid,
+					u8 *client_uuid,
+					u8 *client_name);
 	int			(*ndo_get_vf_config)(struct net_device *dev,
 						     int vf,
 						     struct ifla_vf_info *ivf);
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 78c8598..7268e8e 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -824,6 +824,8 @@  const struct nla_policy ifla_policy[IFLA_MAX+1] = {
 				    .len = sizeof(struct ifla_vf_vlan) },
 	[IFLA_VF_TX_RATE]	= { .type = NLA_BINARY,
 				    .len = sizeof(struct ifla_vf_tx_rate) },
+	[IFLA_VF_PORT_PROFILE]	= { .type = NLA_BINARY,
+				    .len = sizeof(struct ifla_vf_port_profile)},
 };
 EXPORT_SYMBOL(ifla_policy);
 
@@ -1028,6 +1030,24 @@  static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm,
 	}
 	err = 0;
 
+	if (tb[IFLA_VF_PORT_PROFILE]) {
+		struct ifla_vf_port_profile *ivp;
+		ivp = nla_data(tb[IFLA_VF_PORT_PROFILE]);
+		err = -EOPNOTSUPP;
+		if (ops->ndo_set_vf_port_profile)
+			ivp->port_profile[sizeof(ivp->port_profile)-1] = 0;
+			ivp->host_uuid[sizeof(ivp->host_uuid)-1] = 0;
+			ivp->client_uuid[sizeof(ivp->client_uuid)-1] = 0;
+			ivp->client_name[sizeof(ivp->client_name)-1] = 0;
+			err = ops->ndo_set_vf_port_profile(dev, ivp->vf,
+				ivp->port_profile, ivp->mac, ivp->host_uuid,
+				ivp->client_uuid, ivp->client_name);
+		if (err < 0)
+			goto errout;
+		modified = 1;
+	}
+	err = 0;
+
 errout:
 	if (err < 0 && modified && net_ratelimit())
 		printk(KERN_WARNING "A link change request failed with "