diff mbox

[RFC] net: store port/representative id in metadata_dst

Message ID 1474572417-15907-1-git-send-email-jakub.kicinski@netronome.com
State RFC, archived
Delegated to: David Miller
Headers show

Commit Message

Jakub Kicinski Sept. 22, 2016, 7:26 p.m. UTC
Switches and modern SR-IOV enabled NICs may multiplex traffic from
representators and control messages over single set of hardware queues.
Control messages and muxed traffic may need ordered delivery.

Those requirements make it hard to comfortably use TC infrastructure
today unless we have a way of attaching metadata to skbs at the upper
device.  Because single set of queues is used for many netdevs stopping
TC/sched queues of all of them reliably is impossible and lower
device has to resort to returning NETDEV_TX_BUSY and usually
has to take extra locks on the fastpath.

This patch attempts to enable port/representative devs to attach metadata
to skbs which carry port id.  This way representatives can be queueless
and all queuing can be performed at the lower netdev in the usual way.

Traffic arriving on the port/representative interfaces will have
metadata attached and will subsequently be queued to the lower device
for transmission.  The lower device should recognize the metadata and
translate it to HW specific format which is most likely either a special
header inserted before the network headers or descriptor/metadata fields.

Metadata is associated with the lower device by storing the netdev pointer
along with port id so that if TC decides to redirect or mirror, the new
netdev will not try to interpret it.

This is mostly for SR-IOV devices since switches don't have lower
netdevs today.

Since I don't have any real user in the tree at this point please
allow me to present a trivial example use here:

void upper_init(struct upper *upper, struct lower *lower, unsigned int id)
{
	upper->lower_dev = lower;

	upper->dst_meta = metadata_dst_alloc(0, METADATA_HW_PORT_MUX,
					     GFP_KERNEL);
	upper->dst_meta.u.lower_dev = lower->netdev;
	upper->dst_meta.u.port_info.port_id = id;
}

int upper_tx(struct sk_buff *skb, struct net_device *netdev)
{
	struct upper *upper = netdev_priv(netdev);

	skb_dst_drop(skb);
	skb_dst_set_noref(skb, upper->dst_meta);

	return dev_queue_xmit(upper->lower_dev, skb);
}

/*
 * ndo_start_xmit of the lower device: if the skb carries HW port-mux
 * metadata destined for this netdev, translate the port id into the
 * HW-specific format (descriptor field or encap header).
 */
int lower_tx(struct sk_buff *skb, struct net_device *netdev)
{
	struct metadata_dst *md = skb_metadata_dst(skb);
	struct lower *lower = netdev_priv(netdev);

	/* skb_metadata_dst() returns NULL when the skb's dst is not a
	 * metadata dst, so md must be checked before dereferencing.
	 */
	if (md && md->type == METADATA_HW_PORT_MUX &&
	    md->u.port_info.lower_dev == netdev) {
		/* use md->u.port_info.port_id to set port in
		 * descriptor/metadata/do encap
		 */
	}
	...
}

Other approaches considered but found inferior:
 - in-data tags - inserting tags into data will be confusing
   to classifiers which start parsing from mac headers, also
   in-band data is less perfect and allows sufficiently privileged
   user to inject control messages from userspace (this is DSA model
   - note that in SR-IOV switchdev mode I control both upper and lower
   device which differs from DSA where lower device can be any MAC);
 - per-VFR HW queues - requiring a queue per VF is a little wasteful and
   less scalable, muxing allows us to use all PF queues to transmit
   and receive with full RSS (this is model of existing SR-IOV switchdev
   mode implementations);
 - per-VFR TC queue - we could use per-VFR queue in the lower device,
   tag traffic and TX on smaller set of HW queues but again scaling
   would suffer, we would need to lock an extra queue and we have no
   way to stop all queues when HW queues fill up reliably (this model
   would piggy back on dev_queue_xmit_accel() to select queue).


Any comments, reactions would be much appreciated!

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
 include/net/dst_metadata.h     | 35 ++++++++++++++++++++++++++++-------
 net/core/dst.c                 | 14 +++++++++-----
 net/core/filter.c              |  1 +
 net/ipv4/ip_tunnel_core.c      |  5 +++--
 net/openvswitch/flow_netlink.c |  3 ++-
 5 files changed, 43 insertions(+), 15 deletions(-)

Comments

Jiri Pirko Sept. 23, 2016, 6:34 a.m. UTC | #1
Thu, Sep 22, 2016 at 09:26:57PM CEST, jakub.kicinski@netronome.com wrote:
>Switches and modern SR-IOV enabled NICs may multiplex traffic from
>representators and control messages over single set of hardware queues.
>Control messages and muxed traffic may need ordered delivery.
>
>Those requirements make it hard to comfortably use TC infrastructure
>today unless we have a way of attaching metadata to skbs at the upper
>device.  Because single set of queues is used for many netdevs stopping
>TC/sched queues of all of them reliably is impossible and lower
>device has to retreat to returning NETDEV_TX_BUSY and usually
>has to take extra locks on the fastpath.
>
>This patch attempts to enable port/representative devs to attach metadata
>to skbs which carry port id.  This way representatives can be queueless
>and all queuing can be performed at the lower netdev in the usual way.
>
>Traffic arriving on the port/representative interfaces will be have 
>metadata attached and will subsequently be queued to the lower device
>for transmission.  The lower device should recognize the metadata and
>translate it to HW specific format which is most likely either a special
>header inserted before the network headers or descriptor/metadata fields.
>
>Metadata is associated with the lower device by storing the netdev pointer
>along with port id so that if TC decides to redirect or mirror the new 
>netdev will not try to interpret it.
>
>This is mostly for SR-IOV devices since switches don't have lower
>netdevs today.
>
>Since I don't have any real user in the tree at this point please
>allow me to present a trivial example use here:
>
>void upper_init(struct upper *upper, struct lower *lower, unsigned int id)
>{
>	upper->lower_dev = lower;
>
>	upper->dst_meta = metadata_dst_alloc(0, METADATA_HW_PORT_MUX,
>					     GFP_KERNEL);
>	upper->dst_meta.u.lower_dev = lower->netdev;
>	upper->dst_meta.u.port_info.port_id = id;
>}
>
>int upper_tx(struct sk_buff *skb, struct net_device *netdev)
>{
>	struct upper *upper = netdev_priv(netdev);
>
>	skb_dst_drop(skb);
>	skb_dst_set_noref(skb, upper->dst_meta);
>
>	return dev_queue_xmit(upper->lower_dev, skb);
>}
>
>int lower_tx(struct sk_buff *skb, struct net_device *netdev)
>{
>	struct metadata_dst *md = skb_metadata_dst(skb);
>	struct lower *lower = netdev_priv(netdev);
>
>	if (md->type == METADATA_HW_PORT_MUX &&
>	    md->u.lower_dev == netdev) {

What else would it be?


>		/* use md->u.port_id to set port in
>		 * descriptor/metadata/do encap
>		 */
>	}
>	...
>}

So if I understand that correctly, this would need some "shared netdev"
which would effectively serve only as a sink for all port netdevices to
tx packets to. On RX, this would be completely avoided. This lower
device looks like half zombie to me. I don't like it :( I wonder if the
solution would not be possible without this lower netdev.

Btw, for the example implementation, you can use mlxsw, as we have exactly
the same scenario there as you describe.

Thanks.

Jiri

>
>Other approaches considered but found inferior:
> - in-data tags - inserting tags into data will be confusing
>   to classifiers which start parsing from mac headers, also
>   in-band data is less perfect and allows sufficiently privileged
>   user to inject control messages from userspace (this is DSA model
>   - note that in SR-IOV switchdev mode I control both upper and lower
>   device which differs from DSA where lower device can be any MAC);
> - per-VFR HW queues - requiring a queue per VF is a little wasteful and
>   less scalable, muxing allows us to use all PF queues to transmit
>   and receive with full RSS (this is model of existing SR-IOV switchdev
>   mode implementations);
> - per-VFR TC queue - we could use per-VFR queue in the lower device,
>   tag traffic and TX on smaller set of HW queues but again scaling
>   would suffer, we would need to lock an extra queue and we have no
>   way to stop all queues when HW queues fill up reliably (this model
>   would piggy back on dev_queue_xmit_accel() to select queue).
>
>
>Any comments, reactions would be much appreciated!
>
>Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
>---
> include/net/dst_metadata.h     | 35 ++++++++++++++++++++++++++++-------
> net/core/dst.c                 | 14 +++++++++-----
> net/core/filter.c              |  1 +
> net/ipv4/ip_tunnel_core.c      |  5 +++--
> net/openvswitch/flow_netlink.c |  3 ++-
> 5 files changed, 43 insertions(+), 15 deletions(-)
>
>diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h
>index 6965c8f68ade..6d7e1e4f3acd 100644
>--- a/include/net/dst_metadata.h
>+++ b/include/net/dst_metadata.h
>@@ -5,10 +5,22 @@
> #include <net/ip_tunnels.h>
> #include <net/dst.h>
> 
>+enum metadata_type {
>+	METADATA_IP_TUNNEL,
>+	METADATA_HW_PORT_MUX,
>+};
>+
>+struct hw_port_info {
>+	struct netdevice *lower_dev;
>+	u32 port_id;
>+};
>+
> struct metadata_dst {
> 	struct dst_entry		dst;
>+	enum metadata_type		type;
> 	union {
> 		struct ip_tunnel_info	tun_info;
>+		struct hw_port_info	port_info;
> 	} u;
> };
> 
>@@ -27,7 +39,7 @@ static inline struct ip_tunnel_info *skb_tunnel_info(struct sk_buff *skb)
> 	struct metadata_dst *md_dst = skb_metadata_dst(skb);
> 	struct dst_entry *dst;
> 
>-	if (md_dst)
>+	if (md_dst && md_dst->type == METADATA_IP_TUNNEL)
> 		return &md_dst->u.tun_info;
> 
> 	dst = skb_dst(skb);
>@@ -55,7 +67,14 @@ static inline int skb_metadata_dst_cmp(const struct sk_buff *skb_a,
> 	a = (const struct metadata_dst *) skb_dst(skb_a);
> 	b = (const struct metadata_dst *) skb_dst(skb_b);
> 
>-	if (!a != !b || a->u.tun_info.options_len != b->u.tun_info.options_len)
>+	if (!a != !b || a->type != b->type)
>+		return 1;
>+
>+	if (a->type == METADATA_HW_PORT_MUX)
>+		return memcmp(&a->u.port_info, &b->u.port_info,
>+			      sizeof(a->u.port_info));
>+
>+	if (a->u.tun_info.options_len != b->u.tun_info.options_len)
> 		return 1;
> 
> 	return memcmp(&a->u.tun_info, &b->u.tun_info,
>@@ -63,14 +82,16 @@ static inline int skb_metadata_dst_cmp(const struct sk_buff *skb_a,
> }
> 
> void metadata_dst_free(struct metadata_dst *);
>-struct metadata_dst *metadata_dst_alloc(u8 optslen, gfp_t flags);
>-struct metadata_dst __percpu *metadata_dst_alloc_percpu(u8 optslen, gfp_t flags);
>+struct metadata_dst *metadata_dst_alloc(u8 optslen, enum metadata_type type,
>+					gfp_t flags);
>+struct metadata_dst __percpu *
>+metadata_dst_alloc_percpu(u8 optslen, enum metadata_type type, gfp_t flags);
> 
> static inline struct metadata_dst *tun_rx_dst(int md_size)
> {
> 	struct metadata_dst *tun_dst;
> 
>-	tun_dst = metadata_dst_alloc(md_size, GFP_ATOMIC);
>+	tun_dst = metadata_dst_alloc(md_size, METADATA_IP_TUNNEL, GFP_ATOMIC);
> 	if (!tun_dst)
> 		return NULL;
> 
>@@ -85,11 +106,11 @@ static inline struct metadata_dst *tun_dst_unclone(struct sk_buff *skb)
> 	int md_size;
> 	struct metadata_dst *new_md;
> 
>-	if (!md_dst)
>+	if (!md_dst || md_dst->type != METADATA_IP_TUNNEL)
> 		return ERR_PTR(-EINVAL);
> 
> 	md_size = md_dst->u.tun_info.options_len;
>-	new_md = metadata_dst_alloc(md_size, GFP_ATOMIC);
>+	new_md = metadata_dst_alloc(md_size, METADATA_IP_TUNNEL, GFP_ATOMIC);
> 	if (!new_md)
> 		return ERR_PTR(-ENOMEM);
> 
>diff --git a/net/core/dst.c b/net/core/dst.c
>index b5cbbe07f786..dc8c0c0b197b 100644
>--- a/net/core/dst.c
>+++ b/net/core/dst.c
>@@ -367,7 +367,8 @@ static int dst_md_discard(struct sk_buff *skb)
> 	return 0;
> }
> 
>-static void __metadata_dst_init(struct metadata_dst *md_dst, u8 optslen)
>+static void __metadata_dst_init(struct metadata_dst *md_dst,
>+				enum metadata_type type, u8 optslen)
> {
> 	struct dst_entry *dst;
> 
>@@ -379,9 +380,11 @@ static void __metadata_dst_init(struct metadata_dst *md_dst, u8 optslen)
> 	dst->output = dst_md_discard_out;
> 
> 	memset(dst + 1, 0, sizeof(*md_dst) + optslen - sizeof(*dst));
>+	md_dst->type = type;
> }
> 
>-struct metadata_dst *metadata_dst_alloc(u8 optslen, gfp_t flags)
>+struct metadata_dst *metadata_dst_alloc(u8 optslen, enum metadata_type type,
>+					gfp_t flags)
> {
> 	struct metadata_dst *md_dst;
> 
>@@ -389,7 +392,7 @@ struct metadata_dst *metadata_dst_alloc(u8 optslen, gfp_t flags)
> 	if (!md_dst)
> 		return NULL;
> 
>-	__metadata_dst_init(md_dst, optslen);
>+	__metadata_dst_init(md_dst, type, optslen);
> 
> 	return md_dst;
> }
>@@ -403,7 +406,8 @@ void metadata_dst_free(struct metadata_dst *md_dst)
> 	kfree(md_dst);
> }
> 
>-struct metadata_dst __percpu *metadata_dst_alloc_percpu(u8 optslen, gfp_t flags)
>+struct metadata_dst __percpu *
>+metadata_dst_alloc_percpu(u8 optslen, enum metadata_type type, gfp_t flags)
> {
> 	int cpu;
> 	struct metadata_dst __percpu *md_dst;
>@@ -414,7 +418,7 @@ struct metadata_dst __percpu *metadata_dst_alloc_percpu(u8 optslen, gfp_t flags)
> 		return NULL;
> 
> 	for_each_possible_cpu(cpu)
>-		__metadata_dst_init(per_cpu_ptr(md_dst, cpu), optslen);
>+		__metadata_dst_init(per_cpu_ptr(md_dst, cpu), type, optslen);
> 
> 	return md_dst;
> }
>diff --git a/net/core/filter.c b/net/core/filter.c
>index 0920c2ac1d00..61536a7e932e 100644
>--- a/net/core/filter.c
>+++ b/net/core/filter.c
>@@ -2386,6 +2386,7 @@ bpf_get_skb_set_tunnel_proto(enum bpf_func_id which)
> 		 * that is holding verifier mutex.
> 		 */
> 		md_dst = metadata_dst_alloc_percpu(IP_TUNNEL_OPTS_MAX,
>+						   METADATA_IP_TUNNEL,
> 						   GFP_KERNEL);
> 		if (!md_dst)
> 			return NULL;
>diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
>index 777bc1883870..12ffbc4a4daa 100644
>--- a/net/ipv4/ip_tunnel_core.c
>+++ b/net/ipv4/ip_tunnel_core.c
>@@ -145,10 +145,11 @@ struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md,
> 	struct metadata_dst *res;
> 	struct ip_tunnel_info *dst, *src;
> 
>-	if (!md || md->u.tun_info.mode & IP_TUNNEL_INFO_TX)
>+	if (!md || md->type != METADATA_IP_TUNNEL ||
>+	    md->u.tun_info.mode & IP_TUNNEL_INFO_TX)
> 		return NULL;
> 
>-	res = metadata_dst_alloc(0, flags);
>+	res = metadata_dst_alloc(0, METADATA_IP_TUNNEL, flags);
> 	if (!res)
> 		return NULL;
> 
>diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
>index ae25ded82b3b..c9971701d0af 100644
>--- a/net/openvswitch/flow_netlink.c
>+++ b/net/openvswitch/flow_netlink.c
>@@ -2072,7 +2072,8 @@ static int validate_and_copy_set_tun(const struct nlattr *attr,
> 	if (start < 0)
> 		return start;
> 
>-	tun_dst = metadata_dst_alloc(key.tun_opts_len, GFP_KERNEL);
>+	tun_dst = metadata_dst_alloc(key.tun_opts_len, METADATA_IP_TUNNEL,
>+				     GFP_KERNEL);
> 	if (!tun_dst)
> 		return -ENOMEM;
> 
>-- 
>1.9.1
>
Jiri Benc Sept. 23, 2016, 9:06 a.m. UTC | #2
On Fri, 23 Sep 2016 08:34:29 +0200, Jiri Pirko wrote:
> So if I understand that correctly, this would need some "shared netdev"
> which would effectively serve only as a sink for all port netdevices to
> tx packets to. On RX, this would be completely avoided. This lower
> device looks like half zombie to me.

Looks more like a quarter zombie. Even tx would not be allowed unless
going through one of the ports, as all skbs without
METADATA_HW_PORT_MUX metadata_dst would be dropped. But it would be
possible to attach qdisc to the "lower" netdevice and it would actually
have an effect. On rx this netdevice would be ignored completely. This
is very weird behavior.

> I don't like it :( I wonder if the
> solution would not be possible without this lower netdev.

I agree. This approach doesn't sound correct. The skbs should not be
requeued.

 Jiri
Jakub Kicinski Sept. 23, 2016, 12:55 p.m. UTC | #3
On Fri, 23 Sep 2016 11:06:09 +0200, Jiri Benc wrote:
> On Fri, 23 Sep 2016 08:34:29 +0200, Jiri Pirko wrote:
> > So if I understand that correctly, this would need some "shared netdev"
> > which would effectively serve only as a sink for all port netdevices to
> > tx packets to. On RX, this would be completely avoided. This lower
> > device looks like half zombie to me.  
> 
> Looks more like a quarter zombie. Even tx would not be allowed unless
> going through one of the ports, as all skbs without
> METADATA_HW_PORT_MUX metadata_dst would be dropped. But it would be
> possible to attach qdisc to the "lower" netdevice and it would actually
> have an effect. On rx this netdevice would be ignored completely. This
> is very weird behavior.
> 
> > I don't like it :( I wonder if the
> > solution would not be possible without this lower netdev.  
> 
> I agree. This approach doesn't sound correct. The skbs should not be
> requeued.

Thanks for the responses!

I think SR-IOV NICs are coming at this problem from a different angle,
we already have a big, feature-full per-port netdevs and now we want to
spawn representators for VFs to handle fallback traffic.  This patch
would help us mux VFR traffic on all the queues of the physical port
netdevs (the ones which were already present in legacy mode, that's the
lower device).

I read the mlxsw code when I was thinking about this and I wasn't
100% comfortable with returning NETDEV_TX_BUSY, I thought this
behaviour should be generally avoided.  (BTW a very lame question - does
mlxsw ever stop the queues?  AFAICS it only returns BUSY, isn't that
confusing to the stack?)

FWIW the switchdev SR-IOV model we have now seems to be to treat the
existing netdevs as "MAC ports" and spawn representatives for VFs but
not represent PFs in any way.  This makes it impossible to install
VF-PF flow rules.  I worry this can bite us later but that's slightly
different discussion :)  For the purpose of this patch please assume
the lower dev is the MAC/physical/external port.
John Fastabend Sept. 23, 2016, 2:23 p.m. UTC | #4
On 16-09-23 05:55 AM, Jakub Kicinski wrote:
> On Fri, 23 Sep 2016 11:06:09 +0200, Jiri Benc wrote:
>> On Fri, 23 Sep 2016 08:34:29 +0200, Jiri Pirko wrote:
>>> So if I understand that correctly, this would need some "shared netdev"
>>> which would effectively serve only as a sink for all port netdevices to
>>> tx packets to. On RX, this would be completely avoided. This lower
>>> device looks like half zombie to me.  
>>
>> Looks more like a quarter zombie. Even tx would not be allowed unless
>> going through one of the ports, as all skbs without
>> METADATA_HW_PORT_MUX metadata_dst would be dropped. But it would be
>> possible to attach qdisc to the "lower" netdevice and it would actually
>> have an effect. On rx this netdevice would be ignored completely. This
>> is very weird behavior.
>>
>>> I don't like it :( I wonder if the
>>> solution would not be possible without this lower netdev.  
>>
>> I agree. This approach doesn't sound correct. The skbs should not be
>> requeued.
> 
> Thanks for the responses!

Nice timing we were just thinking about this.

> 
> I think SR-IOV NICs are coming at this problem from a different angle,
> we already have a big, feature-full per-port netdevs and now we want to
> spawn representators for VFs to handle fallback traffic.  This patch
> would help us mux VFR traffic on all the queues of the physical port
> netdevs (the ones which were already present in legacy mode, that's the
> lower device).

Yep, I like the idea in general. I had a slightly different approach in
mind though. If you look at __dev_queue_xmit() there is a void
accel_priv pointer (gather you found this based on your commit note).
My take was we could extend this a bit so it can be used by the VFR
devices and they could do a dev_queue_xmit_accel(). In this way there is
no need to touch /net/core/{filter, dst, ip_tunnel}.c etc. Maybe the
accel logic needs to be extended to push the priv pointer all the way
through the xmit routine of the target netdev though. This should look
a lot like the macvlan accelerated xmit device path without the
switching logic.

Of course maybe the name would be extended to dev_queue_xmit_extended()
or something.

So the flow on ingress would be,

  1. pkt_received_by_PF_netdev
  2. PF_netdev reads some tag off packet/descriptor and sets correct
     skb->dev field. This is needed so stack "sees" packets from
     correct VF ports.
  3. packet passed up to stack.

I guess it is a bit "zombie" like on the receive path because the packet
is never actually handled by VF netdev code per se and on egress can
traverse both the VFR and PF netdevs qdiscs. But on the other hand the
VFR netdevs and PF netdevs are all in the same driver. Plus using a
queue per VFR is a bit of a waste as its not needed and also hardware
may not have any mechanism to push VF traffic onto a rx queue.

On egress,

  1. VFR xmit is called
  2. VFR xmit calls dev_queue_xmit_accel() with some meta-data if needed
     for the lower netdev
  3. lower netdev sends out the packet.

Again we don't need to waste any queues for each VFR and the VFR can be
a LLTX device. In this scheme I think you avoid much of the changes in
your patch and keep it all contained in the driver. Any thoughts?

To address 'I wonder if the solution can be done without this lower
netdev' I think it can be but it creates two issues which I'm not sure
have a good solution.

Without a lowerdev we either (a) give each VFR its own queue which I
don't like because it complicates mgmt and uses resources or (b) we
implicitly share queues. The later could be fine it just looks a bit
cleaner IMO to make it explicit.

With regard to VF-PF flow rules, if we allow matching on ingress port,
can all your flow rules be pushed through the PF netdevices, or do you
want any of the VFR netdevs? After all I expect the flow rule
table is actually a shared resource between all attached ports.

Thanks,
.John





> 
> I read the mlxsw code when I was thinking about this and I wasn't
> 100% comfortable with returning NETDEV_TX_BUSY, I thought this
> behaviour should be generally avoided.  (BTW a very lame question - does
> mlxsw ever stop the queues?  AFAICS it only returns BUSY, isn't that
> confusing to the stack?)
> 
> FWIW the switchdev SR-IOV model we have now seems to be to treat the
> existing netdevs as "MAC ports" and spawn representatives for VFs but
> not represent PFs in any way.  This makes it impossible to install
> VF-PF flow rules.  I worry this can bite us later but that's slightly
> different discussion :)  For the purpose of this patch please assume
> the lower dev is the MAC/physical/external port.
>
Jakub Kicinski Sept. 23, 2016, 3:29 p.m. UTC | #5
On Fri, 23 Sep 2016 07:23:26 -0700, John Fastabend wrote:
> On 16-09-23 05:55 AM, Jakub Kicinski wrote:
> > On Fri, 23 Sep 2016 11:06:09 +0200, Jiri Benc wrote:  
> >> On Fri, 23 Sep 2016 08:34:29 +0200, Jiri Pirko wrote:  
> >>> So if I understand that correctly, this would need some "shared netdev"
> >>> which would effectively serve only as a sink for all port netdevices to
> >>> tx packets to. On RX, this would be completely avoided. This lower
> >>> device looks like half zombie to me.    
> >>
> >> Looks more like a quarter zombie. Even tx would not be allowed unless
> >> going through one of the ports, as all skbs without
> >> METADATA_HW_PORT_MUX metadata_dst would be dropped. But it would be
> >> possible to attach qdisc to the "lower" netdevice and it would actually
> >> have an effect. On rx this netdevice would be ignored completely. This
> >> is very weird behavior.
> >>  
> >>> I don't like it :( I wonder if the
> >>> solution would not be possible without this lower netdev.    
> >>
> >> I agree. This approach doesn't sound correct. The skbs should not be
> >> requeued.  
> > 
> > Thanks for the responses!  
> 
> Nice timing we were just thinking about this.
> 
> > 
> > I think SR-IOV NICs are coming at this problem from a different angle,
> > we already have a big, feature-full per-port netdevs and now we want to
> > spawn representators for VFs to handle fallback traffic.  This patch
> > would help us mux VFR traffic on all the queues of the physical port
> > netdevs (the ones which were already present in legacy mode, that's the
> > lower device).  
> 
> Yep, I like the idea in general. I had a slightly different approach in
> mind though. If you look at __dev_queue_xmit() there is a void
> accel_priv pointer (gather you found this based on your commit note).
> My take was we could extend this a bit so it can be used by the VFR
> devices and they could do a dev_queue_xmit_accel(). In this way there is
> no need to touch /net/core/{filter, dst, ip_tunnel}.c etc. Maybe the
> accel logic needs to be extended to push the priv pointer all the way
> through the xmit routine of the target netdev though. This should look
> a lot like the macvlan accelerated xmit device path without the
> switching logic.
> 
> Of course maybe the name would be extended to dev_queue_xmit_extended()
> or something.
> 
> So the flow on ingress would be,
> 
>   1. pkt_received_by_PF_netdev
>   2. PF_netdev reads some tag off packet/descriptor and sets correct
>      skb->dev field. This is needed so stack "sees" packets from
>      correct VF ports.
>   3. packet passed up to stack.
> 
> I guess it is a bit "zombie" like on the receive path because the packet
> is never actually handled by VF netdev code per se and on egress can
> traverse both the VFR and PF netdevs qdiscs. But on the other hand the
> VFR netdevs and PF netdevs are all in the same driver. Plus using a
> queue per VFR is a bit of a waste as its not needed and also hardware
> may not have any mechanism to push VF traffic onto a rx queue.
> 
> On egress,
> 
>   1. VFR xmit is called
>   2. VFR xmit calls dev_queue_xmit_accel() with some meta-data if needed
>      for the lower netdev
>   3. lower netdev sends out the packet.
> 
> Again we don't need to waste any queues for each VFR and the VFR can be
> a LLTX device. In this scheme I think you avoid much of the changes in
> your patch and keep it all contained in the driver. Any thoughts?

Goes without saying that you have a much better understanding of packet
scheduling so please bear with me :)  My target model is that I have
n_cpus x "n_tc/prio" queues on the PF and I want to transmit the
fallback traffic over those same queues.  So no new HW queues are used
for VFRs at all.  This is a reverse of macvlan offload which AFAICT has
"bastard hw queues" which actually TX for a separate software device.

My understanding was that I can rework this model to have software
queues for VFRs (#sw queues == #PF queues + #VFRs) but no extra HW
queues (#hw queues == #PF queues) but then when the driver sees a
packet on sw-only VFR queue it has to pick one of the PF queues (which
one?), lock PF software queue to own it, and only then can it
transmit.  With the dst_metadata there is no need for extra locking or
queue selection.

> To address 'I wonder if the solution can be done without this lower
> netdev' I think it can be but it creates two issues which I'm not sure
> have a good solution.
> 
> Without a lowerdev we either (a) give each VFR its own queue which I
> don't like because it complicates mgmt and uses resources or (b) we
> implicitly share queues. The later could be fine it just looks a bit
> cleaner IMO to make it explicit.
> 
> With regard to VF-PF flow rules if we allow matching on ingress port
> then can all your flow rules be pushed through the PF netdevices or
> if you want any of the VFR netdevs? After all I expsect the flow rule
> table is actually a shared resource between all attached ports.

With the VF-PF forwarding rules I was just inching towards re-opening
the discussion on whether there should be an CPU port netdev.  I guess
there are good reasons why there isn't so maybe let's not go there :)
The meaning of PF netdevs in SR-IOV switchdev mode is "external ports"
AFAICT which could make it cumbersome to reach the host.
Samudrala, Sridhar Sept. 23, 2016, 5:22 p.m. UTC | #6
On 9/23/2016 8:29 AM, Jakub Kicinski wrote:
> On Fri, 23 Sep 2016 07:23:26 -0700, John Fastabend wrote:
>> On 16-09-23 05:55 AM, Jakub Kicinski wrote:
>>> On Fri, 23 Sep 2016 11:06:09 +0200, Jiri Benc wrote:
>>>> On Fri, 23 Sep 2016 08:34:29 +0200, Jiri Pirko wrote:
>>>>> So if I understand that correctly, this would need some "shared netdev"
>>>>> which would effectively serve only as a sink for all port netdevices to
>>>>> tx packets to. On RX, this would be completely avoided. This lower
>>>>> device looks like half zombie to me.
>>>> Looks more like a quarter zombie. Even tx would not be allowed unless
>>>> going through one of the ports, as all skbs without
>>>> METADATA_HW_PORT_MUX metadata_dst would be dropped. But it would be
>>>> possible to attach qdisc to the "lower" netdevice and it would actually
>>>> have an effect. On rx this netdevice would be ignored completely. This
>>>> is very weird behavior.
>>>>   
>>>>> I don't like it :( I wonder if the
>>>>> solution would not be possible without this lower netdev.
>>>> I agree. This approach doesn't sound correct. The skbs should not be
>>>> requeued.
>>> Thanks for the responses!
>> Nice timing we were just thinking about this.
>>
>>> I think SR-IOV NICs are coming at this problem from a different angle,
>>> we already have a big, feature-full per-port netdevs and now we want to
>>> spawn representators for VFs to handle fallback traffic.  This patch
>>> would help us mux VFR traffic on all the queues of the physical port
>>> netdevs (the ones which were already present in legacy mode, that's the
>>> lower device).
>> Yep, I like the idea in general. I had a slightly different approach in
>> mind though. If you look at __dev_queue_xmit() there is a void
>> accel_priv pointer (gather you found this based on your commit note).
>> My take was we could extend this a bit so it can be used by the VFR
>> devices and they could do a dev_queue_xmit_accel(). In this way there is
>> no need to touch /net/core/{filter, dst, ip_tunnel}.c etc. Maybe the
>> accel logic needs to be extended to push the priv pointer all the way
>> through the xmit routine of the target netdev though. This should look
>> a lot like the macvlan accelerated xmit device path without the
>> switching logic.
>>
>> Of course maybe the name would be extended to dev_queue_xmit_extended()
>> or something.
>>
>> So the flow on ingress would be,
>>
>>    1. pkt_received_by_PF_netdev
>>    2. PF_netdev reads some tag off packet/descriptor and sets correct
>>       skb->dev field. This is needed so stack "sees" packets from
>>       correct VF ports.
>>    3. packet passed up to stack.
>>
>> I guess it is a bit "zombie" like on the receive path because the packet
>> is never actually handled by VF netdev code per se and on egress can
>> traverse both the VFR and PF netdevs qdiscs. But on the other hand the
>> VFR netdevs and PF netdevs are all in the same driver. Plus using a
>> queue per VFR is a bit of a waste as its not needed and also hardware
>> may not have any mechanism to push VF traffic onto a rx queue.
>>
>> On egress,
>>
>>    1. VFR xmit is called
>>    2. VFR xmit calls dev_queue_xmit_accel() with some meta-data if needed
>>       for the lower netdev
>>    3. lower netdev sends out the packet.
>>
>> Again we don't need to waste any queues for each VFR and the VFR can be
>> a LLTX device. In this scheme I think you avoid much of the changes in
>> your patch and keep it all contained in the driver. Any thoughts?

The 'accel' parameter in dev_queue_xmit_accel() is currently only passed
to ndo_select_queue() via netdev_pick_tx() and is used to select the tx 
queue.
Also, it is not passed all the way to the driver specific xmit routine.  
Doesn't it require
changing all the driver xmit routines if we want to pass this parameter?

> Goes without saying that you have a much better understanding of packet
> scheduling so please bear with me :)  My target model is that I have
> n_cpus x "n_tc/prio" queues on the PF and I want to transmit the
> fallback traffic over those same queues.  So no new HW queues are used
> for VFRs at all.  This is a reverse of macvlan offload which AFAICT has
> "bastard hw queues" which actually TX for a separate software device.
>
> My understanding was that I can rework this model to have software
> queues for VFRs (#sw queues == #PF queues + #VFRs) but no extra HW
> queues (#hw queues == #PF queues) but then when the driver sees a
> packet on sw-only VFR queue it has to pick one of the PF queues (which
> one?), lock PF software queue to own it, and only then can it
> transmit.  With the dst_metadata there is no need for extra locking or
> queue selection.

Yes.  The VFPR netdevs don't have any HW queues associated with them and
we would like to use the PF queues for the xmit.
I was also looking into some way of passing the port id via an skb
parameter to the dev_queue_xmit() call so that the PF xmit routine can
do a directed transmit to a specific VF.
Is skb->cb an option to pass this info?
The dst_metadata approach would work too if it is acceptable.


>
>> To address 'I wonder if the solution can be done without this lower
>> netdev' I think it can be but it creates two issues which I'm not sure
>> have a good solution.
>>
>> Without a lowerdev we either (a) give each VFR its own queue which I
>> don't like because it complicates mgmt and uses resources or (b) we
>> implicitly share queues. The later could be fine it just looks a bit
>> cleaner IMO to make it explicit.
>>
>> With regard to VF-PF flow rules if we allow matching on ingress port
>> then can all your flow rules be pushed through the PF netdevices or
>> if you want any of the VFR netdevs? After all I expsect the flow rule
>> table is actually a shared resource between all attached ports.
> With the VF-PF forwarding rules I was just inching towards re-opening
> the discussion on whether there should be an CPU port netdev.  I guess
> there are good reasons why there isn't so maybe let's not go there :)
> The meaning of PF netdevs in SR-IOV switchdev mode is "external ports"
> AFAICT which could make it cumbersome to reach the host.
Jakub Kicinski Sept. 23, 2016, 8:17 p.m. UTC | #7
On Fri, 23 Sep 2016 10:22:59 -0700, Samudrala, Sridhar wrote:
> On 9/23/2016 8:29 AM, Jakub Kicinski wrote:
> > On Fri, 23 Sep 2016 07:23:26 -0700, John Fastabend wrote:  
> >> Yep, I like the idea in general. I had a slightly different approach in
> >> mind though. If you look at __dev_queue_xmit() there is a void
> >> accel_priv pointer (gather you found this based on your commit note).
> >> My take was we could extend this a bit so it can be used by the VFR
> >> devices and they could do a dev_queue_xmit_accel(). In this way there is
> >> no need to touch /net/core/{filter, dst, ip_tunnel}.c etc. Maybe the
> >> accel logic needs to be extended to push the priv pointer all the way
> >> through the xmit routine of the target netdev though. This should look
> >> a lot like the macvlan accelerated xmit device path without the
> >> switching logic.
> >>
> >> Of course maybe the name would be extended to dev_queue_xmit_extended()
> >> or something.
> >>
> >> So the flow on ingress would be,
> >>
> >>    1. pkt_received_by_PF_netdev
> >>    2. PF_netdev reads some tag off packet/descriptor and sets correct
> >>       skb->dev field. This is needed so stack "sees" packets from
> >>       correct VF ports.
> >>    3. packet passed up to stack.
> >>
> >> I guess it is a bit "zombie" like on the receive path because the packet
> >> is never actually handled by VF netdev code per se and on egress can
> >> traverse both the VFR and PF netdevs qdiscs. But on the other hand the
> >> VFR netdevs and PF netdevs are all in the same driver. Plus using a
> >> queue per VFR is a bit of a waste as its not needed and also hardware
> >> may not have any mechanism to push VF traffic onto a rx queue.
> >>
> >> On egress,
> >>
> >>    1. VFR xmit is called
> >>    2. VFR xmit calls dev_queue_xmit_accel() with some meta-data if needed
> >>       for the lower netdev
> >>    3. lower netdev sends out the packet.
> >>
> >> Again we don't need to waste any queues for each VFR and the VFR can be
> >> a LLTX device. In this scheme I think you avoid much of the changes in
> >> your patch and keep it all contained in the driver. Any thoughts?  
> 
> The 'accel' parameter in dev_queue_xmit_accel() is currently only passed
> to ndo_select_queue() via netdev_pick_tx() and is used to select the tx 
> queue.
> Also, it is not passed all the way to the driver specific xmit routine.  
> Doesn't it require
> changing all the driver xmit routines if we want to pass this parameter?
> 
> > Goes without saying that you have a much better understanding of packet
> > scheduling so please bear with me :)  My target model is that I have
> > n_cpus x "n_tc/prio" queues on the PF and I want to transmit the
> > fallback traffic over those same queues.  So no new HW queues are used
> > for VFRs at all.  This is a reverse of macvlan offload which AFAICT has
> > "bastard hw queues" which actually TX for a separate software device.
> >
> > My understanding was that I can rework this model to have software
> > queues for VFRs (#sw queues == #PF queues + #VFRs) but no extra HW
> > queues (#hw queues == #PF queues) but then when the driver sees a
> > packet on sw-only VFR queue it has to pick one of the PF queues (which
> > one?), lock PF software queue to own it, and only then can it
> > transmit.  With the dst_metadata there is no need for extra locking or
> > queue selection.  
> 
> Yes.  The VFPR netdevs don't have any HW queues associated with them and 
> we would like
> to use the PF queues for the xmit.
> I was also looking into some way of passing the port id via skb 
> parameter to the
> dev_queue_xmit() call so that the PF xmit routine can do a directed 
> transmit to a specifc VF.
> Is skb->cb an option to pass this info?
> dst_metadata approach would work  too if it is acceptable.

I don't think we can trust skb->cb to be set to anything meaningful
when the skb is received by the lower device.
John Fastabend Sept. 23, 2016, 8:25 p.m. UTC | #8
On 16-09-23 01:17 PM, Jakub Kicinski wrote:
> On Fri, 23 Sep 2016 10:22:59 -0700, Samudrala, Sridhar wrote:
>> On 9/23/2016 8:29 AM, Jakub Kicinski wrote:
>>> On Fri, 23 Sep 2016 07:23:26 -0700, John Fastabend wrote:  
>>>> Yep, I like the idea in general. I had a slightly different approach in
>>>> mind though. If you look at __dev_queue_xmit() there is a void
>>>> accel_priv pointer (gather you found this based on your commit note).
>>>> My take was we could extend this a bit so it can be used by the VFR
>>>> devices and they could do a dev_queue_xmit_accel(). In this way there is
>>>> no need to touch /net/core/{filter, dst, ip_tunnel}.c etc. Maybe the
>>>> accel logic needs to be extended to push the priv pointer all the way
>>>> through the xmit routine of the target netdev though. This should look
>>>> a lot like the macvlan accelerated xmit device path without the
>>>> switching logic.
>>>>
>>>> Of course maybe the name would be extended to dev_queue_xmit_extended()
>>>> or something.
>>>>
>>>> So the flow on ingress would be,
>>>>
>>>>    1. pkt_received_by_PF_netdev
>>>>    2. PF_netdev reads some tag off packet/descriptor and sets correct
>>>>       skb->dev field. This is needed so stack "sees" packets from
>>>>       correct VF ports.
>>>>    3. packet passed up to stack.
>>>>
>>>> I guess it is a bit "zombie" like on the receive path because the packet
>>>> is never actually handled by VF netdev code per se and on egress can
>>>> traverse both the VFR and PF netdevs qdiscs. But on the other hand the
>>>> VFR netdevs and PF netdevs are all in the same driver. Plus using a
>>>> queue per VFR is a bit of a waste as its not needed and also hardware
>>>> may not have any mechanism to push VF traffic onto a rx queue.
>>>>
>>>> On egress,
>>>>
>>>>    1. VFR xmit is called
>>>>    2. VFR xmit calls dev_queue_xmit_accel() with some meta-data if needed
>>>>       for the lower netdev
>>>>    3. lower netdev sends out the packet.
>>>>
>>>> Again we don't need to waste any queues for each VFR and the VFR can be
>>>> a LLTX device. In this scheme I think you avoid much of the changes in
>>>> your patch and keep it all contained in the driver. Any thoughts?  
>>
>> The 'accel' parameter in dev_queue_xmit_accel() is currently only passed
>> to ndo_select_queue() via netdev_pick_tx() and is used to select the tx 
>> queue.
>> Also, it is not passed all the way to the driver specific xmit routine.  
>> Doesn't it require
>> changing all the driver xmit routines if we want to pass this parameter?
>>
>>> Goes without saying that you have a much better understanding of packet
>>> scheduling so please bear with me :)  My target model is that I have
>>> n_cpus x "n_tc/prio" queues on the PF and I want to transmit the
>>> fallback traffic over those same queues.  So no new HW queues are used
>>> for VFRs at all.  This is a reverse of macvlan offload which AFAICT has
>>> "bastard hw queues" which actually TX for a separate software device.
>>>
>>> My understanding was that I can rework this model to have software
>>> queues for VFRs (#sw queues == #PF queues + #VFRs) but no extra HW
>>> queues (#hw queues == #PF queues) but then when the driver sees a
>>> packet on sw-only VFR queue it has to pick one of the PF queues (which
>>> one?), lock PF software queue to own it, and only then can it
>>> transmit.  With the dst_metadata there is no need for extra locking or
>>> queue selection.  
>>
>> Yes.  The VFPR netdevs don't have any HW queues associated with them and 
>> we would like
>> to use the PF queues for the xmit.
>> I was also looking into some way of passing the port id via skb 
>> parameter to the
>> dev_queue_xmit() call so that the PF xmit routine can do a directed 
>> transmit to a specifc VF.
>> Is skb->cb an option to pass this info?
>> dst_metadata approach would work  too if it is acceptable.
> 
> I don't think we can trust skb->cb to be set to anything meaningful
> when the skb is received by the lower device. 
> 

Agreed. I wouldn't recommend using skb->cb. How about passing it via
dev_queue_xmit_accel() through to the driver?

If you pass the metadata through the dev_queue_xmit_accel() handle, tx
queue selection would work using the normal mechanisms (xps,
select_queue, cls hook, etc.). If you wanted to pick some specific queue
based on policy, the policy could be loaded into one of those hooks.

.John
Jakub Kicinski Sept. 23, 2016, 8:45 p.m. UTC | #9
On Fri, 23 Sep 2016 13:25:10 -0700, John Fastabend wrote:
> On 16-09-23 01:17 PM, Jakub Kicinski wrote:
> > On Fri, 23 Sep 2016 10:22:59 -0700, Samudrala, Sridhar wrote:  
> >> On 9/23/2016 8:29 AM, Jakub Kicinski wrote:  
>  [...]  
>  [...]  
> >>
> >> The 'accel' parameter in dev_queue_xmit_accel() is currently only passed
> >> to ndo_select_queue() via netdev_pick_tx() and is used to select the tx 
> >> queue.
> >> Also, it is not passed all the way to the driver specific xmit routine.  
> >> Doesn't it require
> >> changing all the driver xmit routines if we want to pass this parameter?
> >>  
>  [...]  
> >>
> >> Yes.  The VFPR netdevs don't have any HW queues associated with them and 
> >> we would like
> >> to use the PF queues for the xmit.
> >> I was also looking into some way of passing the port id via skb 
> >> parameter to the
> >> dev_queue_xmit() call so that the PF xmit routine can do a directed 
> >> transmit to a specifc VF.
> >> Is skb->cb an option to pass this info?
> >> dst_metadata approach would work  too if it is acceptable.  
> > 
> > I don't think we can trust skb->cb to be set to anything meaningful
> > when the skb is received by the lower device. 
> 
> Agreed. I wouldn't recommend using skb->cb. How about passing it through
> dev_queue_xmit_accel() through to the driver?
> 
> If you pass the metadata through the dev_queue_xmit_accel() handle tx
> queue  selection would work using normal mechanisms (xps, select_queue,
> cls  hook, etc.). If you wanted to pick some specific queue based on
> policy the policy could be loaded into one of those hooks.

Do you mean without extending how accel is handled by
dev_queue_xmit_accel() today?  If my goal is to not have extra HW
queues then I don't see how I could mux in the lower dev without extra
locking (as I tried to explain two emails ago).  Sorry for being slow
here :(
John Fastabend Sept. 23, 2016, 9:20 p.m. UTC | #10
On 16-09-23 01:45 PM, Jakub Kicinski wrote:
> On Fri, 23 Sep 2016 13:25:10 -0700, John Fastabend wrote:
>> On 16-09-23 01:17 PM, Jakub Kicinski wrote:
>>> On Fri, 23 Sep 2016 10:22:59 -0700, Samudrala, Sridhar wrote:  
>>>> On 9/23/2016 8:29 AM, Jakub Kicinski wrote:  
>>  [...]  
>>  [...]  
>>>>
>>>> The 'accel' parameter in dev_queue_xmit_accel() is currently only passed
>>>> to ndo_select_queue() via netdev_pick_tx() and is used to select the tx 
>>>> queue.
>>>> Also, it is not passed all the way to the driver specific xmit routine.  
>>>> Doesn't it require
>>>> changing all the driver xmit routines if we want to pass this parameter?
>>>>  
>>  [...]  
>>>>
>>>> Yes.  The VFPR netdevs don't have any HW queues associated with them and 
>>>> we would like
>>>> to use the PF queues for the xmit.
>>>> I was also looking into some way of passing the port id via skb 
>>>> parameter to the
>>>> dev_queue_xmit() call so that the PF xmit routine can do a directed 
>>>> transmit to a specifc VF.
>>>> Is skb->cb an option to pass this info?
>>>> dst_metadata approach would work  too if it is acceptable.  
>>>
>>> I don't think we can trust skb->cb to be set to anything meaningful
>>> when the skb is received by the lower device. 
>>
>> Agreed. I wouldn't recommend using skb->cb. How about passing it through
>> dev_queue_xmit_accel() through to the driver?
>>
>> If you pass the metadata through the dev_queue_xmit_accel() handle tx
>> queue  selection would work using normal mechanisms (xps, select_queue,
>> cls  hook, etc.). If you wanted to pick some specific queue based on
>> policy the policy could be loaded into one of those hooks.
> 
> Do you mean without extending how accel is handled by
> dev_queue_xmit_accel() today?  If my goal is to not have extra HW
> queues then I don't see how I could mux in the lower dev without extra
> locking (as I tried to explain two emails ago).  Sorry for being slow
> here :(
> 

Not slow here I think I was overly optimistic...

Yeh let me try this, roughly the current flow is,

   dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv);
   __dev_queue_xmit(skb, accel_priv);
   netdev_pick_tx(dev, skb, accel_priv);
	ndo_select_queue(dev, skb, accel_priv, ...);
   [...]
   q->enqueue();
   [...]
   dev_hard_start_xmit();
   [...]
    <driver code here>

So in this flow the VFR netdev driver handles its xmit routine by
calling dev_queue_xmit_accel after setting skb->dev to the physical
device and passing a cookie via accel that the select_queue() routine
can use to pick a tx queue. The rest of the stack q->enqueue() and
friends will ensure that locking and qdisc is handled correctly.

But accel_priv was lost at queue selection and so its not being passed
down to the driver so no way to set your descriptor bits or whatever
needed to push to the VF. I was sort of thinking we could map it from
the select_queue routine but I can't figure out how to do that either.

The metadata idea doesn't seem that bad now that I've spent some more
time going through it. Either that or hijack some field in the skb but
I think that might be worse than the proposal here.

I'm trying to think up some other alternative now and will let you know
if I think of anything clever but got nothing at the moment.

.John
Jakub Kicinski Sept. 29, 2016, 11:10 a.m. UTC | #11
On Fri, 23 Sep 2016 14:20:40 -0700, John Fastabend wrote:
> On 16-09-23 01:45 PM, Jakub Kicinski wrote:
> > On Fri, 23 Sep 2016 13:25:10 -0700, John Fastabend wrote:  
> >> On 16-09-23 01:17 PM, Jakub Kicinski wrote:  
> >>> On Fri, 23 Sep 2016 10:22:59 -0700, Samudrala, Sridhar wrote:    
> >>>> On 9/23/2016 8:29 AM, Jakub Kicinski wrote:    
> >>  [...]  
> >>  [...]    
> >>>>
> >>>> The 'accel' parameter in dev_queue_xmit_accel() is currently only passed
> >>>> to ndo_select_queue() via netdev_pick_tx() and is used to select the tx 
> >>>> queue.
> >>>> Also, it is not passed all the way to the driver specific xmit routine.  
> >>>> Doesn't it require
> >>>> changing all the driver xmit routines if we want to pass this parameter?
> >>>>    
> >>  [...]    
> >>>>
> >>>> Yes.  The VFPR netdevs don't have any HW queues associated with them and 
> >>>> we would like
> >>>> to use the PF queues for the xmit.
> >>>> I was also looking into some way of passing the port id via skb 
> >>>> parameter to the
> >>>> dev_queue_xmit() call so that the PF xmit routine can do a directed 
> >>>> transmit to a specifc VF.
> >>>> Is skb->cb an option to pass this info?
> >>>> dst_metadata approach would work  too if it is acceptable.    
> >>>
> >>> I don't think we can trust skb->cb to be set to anything meaningful
> >>> when the skb is received by the lower device.   
> >>
> >> Agreed. I wouldn't recommend using skb->cb. How about passing it through
> >> dev_queue_xmit_accel() through to the driver?
> >>
> >> If you pass the metadata through the dev_queue_xmit_accel() handle tx
> >> queue  selection would work using normal mechanisms (xps, select_queue,
> >> cls  hook, etc.). If you wanted to pick some specific queue based on
> >> policy the policy could be loaded into one of those hooks.  
> > 
> > Do you mean without extending how accel is handled by
> > dev_queue_xmit_accel() today?  If my goal is to not have extra HW
> > queues then I don't see how I could mux in the lower dev without extra
> > locking (as I tried to explain two emails ago).  Sorry for being slow
> > here :(
> >   
> 
> Not slow here I think I was overly optimistic...
> 
> Yeh let me try this, roughly the current flow is,
> 
>    dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv);
>    __dev_queue_xmit(skb, accel_priv);
>    netdev_pick_tx(dev, skb, accel_priv);
> 	ndo_select_queue(dev, skb, accel_priv, ...);
>    [...]
>    q->enqueue();
>    [...]
>    dev_hard_start_xmit();
>    [...]
>     <driver code here>
> 
> So in this flow the VFR netdev driver handles its xmit routine by
> calling dev_queue_xmit_accel after setting skb->dev to the physical
> device and passing a cookie via accel that the select_queue() routine
> can use to pick a tx queue. The rest of the stack q->enqueue() and
> friends will ensure that locking and qdisc is handled correctly.
> 
> But accel_priv was lost at queue selection and so its not being passed
> down to the driver so no way to set your descriptor bits or whatever
> needed to push to the VF. I was sort of thinking we could map it from
> the select_queue routine but I can't figure out how to do that either.
> 
> The metadata idea doesn't seem that bad now that I've spent some more
> time going through it. Either that or hijack some field in the skb but
> I think that might be worse than the proposal here.
> 
> I'm trying to think up some other alternative now and will let you know
> if I think of anything clever but got nothing at the moment.
	
Cool, I'm happy to discuss this further at netdev but it seems like
there is no strong opposition so far?

FWIW in the example I gave I didn't do refcounting on the dst but I
think that's incorrect since we don't have control over lifetime of
redirected/stolen skbs.
diff mbox

Patch

diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h
index 6965c8f68ade..6d7e1e4f3acd 100644
--- a/include/net/dst_metadata.h
+++ b/include/net/dst_metadata.h
@@ -5,10 +5,22 @@ 
 #include <net/ip_tunnels.h>
 #include <net/dst.h>
 
+enum metadata_type {
+	METADATA_IP_TUNNEL,
+	METADATA_HW_PORT_MUX,
+};
+
+struct hw_port_info {
+	struct net_device *lower_dev;
+	u32 port_id;
+};
+
 struct metadata_dst {
 	struct dst_entry		dst;
+	enum metadata_type		type;
 	union {
 		struct ip_tunnel_info	tun_info;
+		struct hw_port_info	port_info;
 	} u;
 };
 
@@ -27,7 +39,7 @@  static inline struct ip_tunnel_info *skb_tunnel_info(struct sk_buff *skb)
 	struct metadata_dst *md_dst = skb_metadata_dst(skb);
 	struct dst_entry *dst;
 
-	if (md_dst)
+	if (md_dst && md_dst->type == METADATA_IP_TUNNEL)
 		return &md_dst->u.tun_info;
 
 	dst = skb_dst(skb);
@@ -55,7 +67,14 @@  static inline int skb_metadata_dst_cmp(const struct sk_buff *skb_a,
 	a = (const struct metadata_dst *) skb_dst(skb_a);
 	b = (const struct metadata_dst *) skb_dst(skb_b);
 
-	if (!a != !b || a->u.tun_info.options_len != b->u.tun_info.options_len)
+	if (!a != !b || a->type != b->type)
+		return 1;
+
+	if (a->type == METADATA_HW_PORT_MUX)
+		return memcmp(&a->u.port_info, &b->u.port_info,
+			      sizeof(a->u.port_info));
+
+	if (a->u.tun_info.options_len != b->u.tun_info.options_len)
 		return 1;
 
 	return memcmp(&a->u.tun_info, &b->u.tun_info,
@@ -63,14 +82,16 @@  static inline int skb_metadata_dst_cmp(const struct sk_buff *skb_a,
 }
 
 void metadata_dst_free(struct metadata_dst *);
-struct metadata_dst *metadata_dst_alloc(u8 optslen, gfp_t flags);
-struct metadata_dst __percpu *metadata_dst_alloc_percpu(u8 optslen, gfp_t flags);
+struct metadata_dst *metadata_dst_alloc(u8 optslen, enum metadata_type type,
+					gfp_t flags);
+struct metadata_dst __percpu *
+metadata_dst_alloc_percpu(u8 optslen, enum metadata_type type, gfp_t flags);
 
 static inline struct metadata_dst *tun_rx_dst(int md_size)
 {
 	struct metadata_dst *tun_dst;
 
-	tun_dst = metadata_dst_alloc(md_size, GFP_ATOMIC);
+	tun_dst = metadata_dst_alloc(md_size, METADATA_IP_TUNNEL, GFP_ATOMIC);
 	if (!tun_dst)
 		return NULL;
 
@@ -85,11 +106,11 @@  static inline struct metadata_dst *tun_dst_unclone(struct sk_buff *skb)
 	int md_size;
 	struct metadata_dst *new_md;
 
-	if (!md_dst)
+	if (!md_dst || md_dst->type != METADATA_IP_TUNNEL)
 		return ERR_PTR(-EINVAL);
 
 	md_size = md_dst->u.tun_info.options_len;
-	new_md = metadata_dst_alloc(md_size, GFP_ATOMIC);
+	new_md = metadata_dst_alloc(md_size, METADATA_IP_TUNNEL, GFP_ATOMIC);
 	if (!new_md)
 		return ERR_PTR(-ENOMEM);
 
diff --git a/net/core/dst.c b/net/core/dst.c
index b5cbbe07f786..dc8c0c0b197b 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -367,7 +367,8 @@  static int dst_md_discard(struct sk_buff *skb)
 	return 0;
 }
 
-static void __metadata_dst_init(struct metadata_dst *md_dst, u8 optslen)
+static void __metadata_dst_init(struct metadata_dst *md_dst,
+				enum metadata_type type, u8 optslen)
 {
 	struct dst_entry *dst;
 
@@ -379,9 +380,11 @@  static void __metadata_dst_init(struct metadata_dst *md_dst, u8 optslen)
 	dst->output = dst_md_discard_out;
 
 	memset(dst + 1, 0, sizeof(*md_dst) + optslen - sizeof(*dst));
+	md_dst->type = type;
 }
 
-struct metadata_dst *metadata_dst_alloc(u8 optslen, gfp_t flags)
+struct metadata_dst *metadata_dst_alloc(u8 optslen, enum metadata_type type,
+					gfp_t flags)
 {
 	struct metadata_dst *md_dst;
 
@@ -389,7 +392,7 @@  struct metadata_dst *metadata_dst_alloc(u8 optslen, gfp_t flags)
 	if (!md_dst)
 		return NULL;
 
-	__metadata_dst_init(md_dst, optslen);
+	__metadata_dst_init(md_dst, type, optslen);
 
 	return md_dst;
 }
@@ -403,7 +406,8 @@  void metadata_dst_free(struct metadata_dst *md_dst)
 	kfree(md_dst);
 }
 
-struct metadata_dst __percpu *metadata_dst_alloc_percpu(u8 optslen, gfp_t flags)
+struct metadata_dst __percpu *
+metadata_dst_alloc_percpu(u8 optslen, enum metadata_type type, gfp_t flags)
 {
 	int cpu;
 	struct metadata_dst __percpu *md_dst;
@@ -414,7 +418,7 @@  struct metadata_dst __percpu *metadata_dst_alloc_percpu(u8 optslen, gfp_t flags)
 		return NULL;
 
 	for_each_possible_cpu(cpu)
-		__metadata_dst_init(per_cpu_ptr(md_dst, cpu), optslen);
+		__metadata_dst_init(per_cpu_ptr(md_dst, cpu), type, optslen);
 
 	return md_dst;
 }
diff --git a/net/core/filter.c b/net/core/filter.c
index 0920c2ac1d00..61536a7e932e 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2386,6 +2386,7 @@  bpf_get_skb_set_tunnel_proto(enum bpf_func_id which)
 		 * that is holding verifier mutex.
 		 */
 		md_dst = metadata_dst_alloc_percpu(IP_TUNNEL_OPTS_MAX,
+						   METADATA_IP_TUNNEL,
 						   GFP_KERNEL);
 		if (!md_dst)
 			return NULL;
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index 777bc1883870..12ffbc4a4daa 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -145,10 +145,11 @@  struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md,
 	struct metadata_dst *res;
 	struct ip_tunnel_info *dst, *src;
 
-	if (!md || md->u.tun_info.mode & IP_TUNNEL_INFO_TX)
+	if (!md || md->type != METADATA_IP_TUNNEL ||
+	    md->u.tun_info.mode & IP_TUNNEL_INFO_TX)
 		return NULL;
 
-	res = metadata_dst_alloc(0, flags);
+	res = metadata_dst_alloc(0, METADATA_IP_TUNNEL, flags);
 	if (!res)
 		return NULL;
 
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index ae25ded82b3b..c9971701d0af 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -2072,7 +2072,8 @@  static int validate_and_copy_set_tun(const struct nlattr *attr,
 	if (start < 0)
 		return start;
 
-	tun_dst = metadata_dst_alloc(key.tun_opts_len, GFP_KERNEL);
+	tun_dst = metadata_dst_alloc(key.tun_opts_len, METADATA_IP_TUNNEL,
+				     GFP_KERNEL);
 	if (!tun_dst)
 		return -ENOMEM;