diff mbox series

[RFC,net-next,v2,2/2] virtio_net: Extend virtio to use VF datapath when available

Message ID 1515736720-39368-3-git-send-email-sridhar.samudrala@intel.com
State RFC, archived
Delegated to: David Miller
Headers show
Series Enable virtio to act as a backup for a passthru device | expand

Commit Message

Samudrala, Sridhar Jan. 12, 2018, 5:58 a.m. UTC
This patch enables virtio_net to switch over to a VF datapath when a VF
netdev is present with the same MAC address. The VF datapath is only used
for unicast traffic. Broadcasts/multicasts go via the virtio datapath so that
east-west broadcasts don't use the PCI bandwidth. It allows live migration
of a VM with a directly attached VF without the need to set up a bond/team
between a VF and virtio net device in the guest.

The hypervisor needs to unplug the VF device from the guest on the source
host and reset the MAC filter of the VF to initiate failover of datapath to
virtio before starting the migration. After the migration is completed, the
destination hypervisor sets the MAC filter on the VF and plugs it back to
the guest to switch over to VF datapath.

This patch is based on the discussion initiated by Jesse on this thread.
https://marc.info/?l=linux-virtualization&m=151189725224231&w=2

Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
Reviewed-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
---
 drivers/net/virtio_net.c | 307 ++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 305 insertions(+), 2 deletions(-)

Comments

Siwei Liu Jan. 22, 2018, 8:27 p.m. UTC | #1
First off, as mentioned in another thread, the model of stacking up
virt-bond functionality over virtio seems like the wrong direction to me.
Essentially the migration process would need to carry over all guest
side configurations previously done on the VF/PT and get them moved to
the new device, be it virtio or VF/PT. Without the help of a new
upper layer bond driver that enslaves virtio and VF/PT devices
underneath, virtio will be overloaded with too many VF/PT-backup
specifics in the future. I hope you're already aware of the issue
in the longer term and will move to that model as soon as possible. See more
inline.

On Thu, Jan 11, 2018 at 9:58 PM, Sridhar Samudrala
<sridhar.samudrala@intel.com> wrote:
> This patch enables virtio_net to switch over to a VF datapath when a VF
> netdev is present with the same MAC address. The VF datapath is only used
> for unicast traffic. Broadcasts/multicasts go via virtio datapath so that
> east-west broadcasts don't use the PCI bandwidth.

Why not make this an option/optimization rather than the only means? The
problem of east-west broadcasts eating PCI bandwidth depends on the
specifics of the (virtual) network setup, while some users won't want to
lose the VF's merits such as latency. Why restrict broadcast/multicast
xmit to virtio only, which potentially regresses performance against a
raw VF?

> It allows live migration
> of a VM with a direct attached VF without the need to setup a bond/team
> between a VF and virtio net device in the guest.
>
> The hypervisor needs to unplug the VF device from the guest on the source
> host and reset the MAC filter of the VF to initiate failover of datapath to
> virtio before starting the migration. After the migration is completed, the
> destination hypervisor sets the MAC filter on the VF and plugs it back to
> the guest to switch over to VF datapath.

Is there a host side patch (planned) for this MAC filter switching
process? As said in another thread, that simple script won't work for
the macvtap backend.

Thanks,
-Siwei

>
> This patch is based on the discussion initiated by Jesse on this thread.
> https://marc.info/?l=linux-virtualization&m=151189725224231&w=2
>
> Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
> Reviewed-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
> ---
>  drivers/net/virtio_net.c | 307 ++++++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 305 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index f149a160a8c5..0e58d364fde9 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -30,6 +30,8 @@
>  #include <linux/cpu.h>
>  #include <linux/average.h>
>  #include <linux/filter.h>
> +#include <linux/netdevice.h>
> +#include <linux/netpoll.h>
>  #include <net/route.h>
>  #include <net/xdp.h>
>
> @@ -120,6 +122,15 @@ struct receive_queue {
>         struct xdp_rxq_info xdp_rxq;
>  };
>
> +struct virtnet_vf_pcpu_stats {
> +       u64     rx_packets;
> +       u64     rx_bytes;
> +       u64     tx_packets;
> +       u64     tx_bytes;
> +       struct u64_stats_sync   syncp;
> +       u32     tx_dropped;
> +};
> +
>  struct virtnet_info {
>         struct virtio_device *vdev;
>         struct virtqueue *cvq;
> @@ -182,6 +193,10 @@ struct virtnet_info {
>         u32 speed;
>
>         unsigned long guest_offloads;
> +
> +       /* State to manage the associated VF interface. */
> +       struct net_device __rcu *vf_netdev;
> +       struct virtnet_vf_pcpu_stats __percpu *vf_stats;
>  };
>
>  struct padded_vnet_hdr {
> @@ -1314,16 +1329,53 @@ static int xmit_skb(struct send_queue *sq, struct sk_buff *skb)
>         return virtqueue_add_outbuf(sq->vq, sq->sg, num_sg, skb, GFP_ATOMIC);
>  }
>
> +/* Send skb on the slave VF device. */
> +static int virtnet_vf_xmit(struct net_device *dev, struct net_device *vf_netdev,
> +                          struct sk_buff *skb)
> +{
> +       struct virtnet_info *vi = netdev_priv(dev);
> +       unsigned int len = skb->len;
> +       int rc;
> +
> +       skb->dev = vf_netdev;
> +       skb->queue_mapping = qdisc_skb_cb(skb)->slave_dev_queue_mapping;
> +
> +       rc = dev_queue_xmit(skb);
> +       if (likely(rc == NET_XMIT_SUCCESS || rc == NET_XMIT_CN)) {
> +               struct virtnet_vf_pcpu_stats *pcpu_stats
> +                       = this_cpu_ptr(vi->vf_stats);
> +
> +               u64_stats_update_begin(&pcpu_stats->syncp);
> +               pcpu_stats->tx_packets++;
> +               pcpu_stats->tx_bytes += len;
> +               u64_stats_update_end(&pcpu_stats->syncp);
> +       } else {
> +               this_cpu_inc(vi->vf_stats->tx_dropped);
> +       }
> +
> +       return rc;
> +}
> +
>  static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
>  {
>         struct virtnet_info *vi = netdev_priv(dev);
>         int qnum = skb_get_queue_mapping(skb);
>         struct send_queue *sq = &vi->sq[qnum];
> +       struct net_device *vf_netdev;
>         int err;
>         struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum);
>         bool kick = !skb->xmit_more;
>         bool use_napi = sq->napi.weight;
>
> +       /* If VF is present and up then redirect packets
> +        * called with rcu_read_lock_bh
> +        */
> +       vf_netdev = rcu_dereference_bh(vi->vf_netdev);
> +       if (vf_netdev && netif_running(vf_netdev) &&
> +           !netpoll_tx_running(dev) &&
> +           is_unicast_ether_addr(eth_hdr(skb)->h_dest))
> +               return virtnet_vf_xmit(dev, vf_netdev, skb);
> +
>         /* Free up any pending old buffers before queueing new ones. */
>         free_old_xmit_skbs(sq);
>
> @@ -1470,10 +1522,41 @@ static int virtnet_set_mac_address(struct net_device *dev, void *p)
>         return ret;
>  }
>
> +static void virtnet_get_vf_stats(struct net_device *dev,
> +                                struct virtnet_vf_pcpu_stats *tot)
> +{
> +       struct virtnet_info *vi = netdev_priv(dev);
> +       int i;
> +
> +       memset(tot, 0, sizeof(*tot));
> +
> +       for_each_possible_cpu(i) {
> +               const struct virtnet_vf_pcpu_stats *stats
> +                               = per_cpu_ptr(vi->vf_stats, i);
> +               u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
> +               unsigned int start;
> +
> +               do {
> +                       start = u64_stats_fetch_begin_irq(&stats->syncp);
> +                       rx_packets = stats->rx_packets;
> +                       tx_packets = stats->tx_packets;
> +                       rx_bytes = stats->rx_bytes;
> +                       tx_bytes = stats->tx_bytes;
> +               } while (u64_stats_fetch_retry_irq(&stats->syncp, start));
> +
> +               tot->rx_packets += rx_packets;
> +               tot->tx_packets += tx_packets;
> +               tot->rx_bytes   += rx_bytes;
> +               tot->tx_bytes   += tx_bytes;
> +               tot->tx_dropped += stats->tx_dropped;
> +       }
> +}
> +
>  static void virtnet_stats(struct net_device *dev,
>                           struct rtnl_link_stats64 *tot)
>  {
>         struct virtnet_info *vi = netdev_priv(dev);
> +       struct virtnet_vf_pcpu_stats vf_stats;
>         int cpu;
>         unsigned int start;
>
> @@ -1504,6 +1587,13 @@ static void virtnet_stats(struct net_device *dev,
>         tot->rx_dropped = dev->stats.rx_dropped;
>         tot->rx_length_errors = dev->stats.rx_length_errors;
>         tot->rx_frame_errors = dev->stats.rx_frame_errors;
> +
> +       virtnet_get_vf_stats(dev, &vf_stats);
> +       tot->rx_packets += vf_stats.rx_packets;
> +       tot->tx_packets += vf_stats.tx_packets;
> +       tot->rx_bytes += vf_stats.rx_bytes;
> +       tot->tx_bytes += vf_stats.tx_bytes;
> +       tot->tx_dropped += vf_stats.tx_dropped;
>  }
>
>  #ifdef CONFIG_NET_POLL_CONTROLLER
> @@ -2635,6 +2725,13 @@ static int virtnet_probe(struct virtio_device *vdev)
>
>         INIT_WORK(&vi->config_work, virtnet_config_changed_work);
>
> +       if (virtio_has_feature(vdev, VIRTIO_NET_F_BACKUP)) {
> +               vi->vf_stats =
> +                       netdev_alloc_pcpu_stats(struct virtnet_vf_pcpu_stats);
> +               if (!vi->vf_stats)
> +                       goto free_stats;
> +       }
> +
>         /* If we can receive ANY GSO packets, we must allocate large ones. */
>         if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO4) ||
>             virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO6) ||
> @@ -2668,7 +2765,7 @@ static int virtnet_probe(struct virtio_device *vdev)
>                          */
>                         dev_err(&vdev->dev, "device MTU appears to have changed "
>                                 "it is now %d < %d", mtu, dev->min_mtu);
> -                       goto free_stats;
> +                       goto free_vf_stats;
>                 }
>
>                 dev->mtu = mtu;
> @@ -2692,7 +2789,7 @@ static int virtnet_probe(struct virtio_device *vdev)
>         /* Allocate/initialize the rx/tx queues, and invoke find_vqs */
>         err = init_vqs(vi);
>         if (err)
> -               goto free_stats;
> +               goto free_vf_stats;
>
>  #ifdef CONFIG_SYSFS
>         if (vi->mergeable_rx_bufs)
> @@ -2747,6 +2844,8 @@ static int virtnet_probe(struct virtio_device *vdev)
>         cancel_delayed_work_sync(&vi->refill);
>         free_receive_page_frags(vi);
>         virtnet_del_vqs(vi);
> +free_vf_stats:
> +       free_percpu(vi->vf_stats);
>  free_stats:
>         free_percpu(vi->stats);
>  free:
> @@ -2768,19 +2867,184 @@ static void remove_vq_common(struct virtnet_info *vi)
>         virtnet_del_vqs(vi);
>  }
>
> +static struct net_device *get_virtio_bymac(const u8 *mac)
> +{
> +       struct net_device *dev;
> +
> +       ASSERT_RTNL();
> +
> +       for_each_netdev(&init_net, dev) {
> +               if (dev->netdev_ops != &virtnet_netdev)
> +                       continue;       /* not a virtio_net device */
> +
> +               if (ether_addr_equal(mac, dev->perm_addr))
> +                       return dev;
> +       }
> +
> +       return NULL;
> +}
> +
> +static struct net_device *get_virtio_byref(struct net_device *vf_netdev)
> +{
> +       struct net_device *dev;
> +
> +       ASSERT_RTNL();
> +
> +       for_each_netdev(&init_net, dev) {
> +               struct virtnet_info *vi;
> +
> +               if (dev->netdev_ops != &virtnet_netdev)
> +                       continue;       /* not a virtio_net device */
> +
> +               vi = netdev_priv(dev);
> +               if (rtnl_dereference(vi->vf_netdev) == vf_netdev)
> +                       return dev;     /* a match */
> +       }
> +
> +       return NULL;
> +}
> +
> +/* Called when VF is injecting data into network stack.
> + * Change the associated network device from VF to virtio.
> + * note: already called with rcu_read_lock
> + */
> +static rx_handler_result_t virtnet_vf_handle_frame(struct sk_buff **pskb)
> +{
> +       struct sk_buff *skb = *pskb;
> +       struct net_device *ndev = rcu_dereference(skb->dev->rx_handler_data);
> +       struct virtnet_info *vi = netdev_priv(ndev);
> +       struct virtnet_vf_pcpu_stats *pcpu_stats =
> +                               this_cpu_ptr(vi->vf_stats);
> +
> +       skb->dev = ndev;
> +
> +       u64_stats_update_begin(&pcpu_stats->syncp);
> +       pcpu_stats->rx_packets++;
> +       pcpu_stats->rx_bytes += skb->len;
> +       u64_stats_update_end(&pcpu_stats->syncp);
> +
> +       return RX_HANDLER_ANOTHER;
> +}
> +
> +static int virtnet_vf_join(struct net_device *vf_netdev,
> +                          struct net_device *ndev)
> +{
> +       int ret;
> +
> +       ret = netdev_rx_handler_register(vf_netdev,
> +                                        virtnet_vf_handle_frame, ndev);
> +       if (ret != 0) {
> +               netdev_err(vf_netdev,
> +                          "can not register virtio VF receive handler (err = %d)\n",
> +                          ret);
> +               goto rx_handler_failed;
> +       }
> +
> +       ret = netdev_upper_dev_link(vf_netdev, ndev, NULL);
> +       if (ret != 0) {
> +               netdev_err(vf_netdev,
> +                          "can not set master device %s (err = %d)\n",
> +                          ndev->name, ret);
> +               goto upper_link_failed;
> +       }
> +
> +       vf_netdev->flags |= IFF_SLAVE;
> +
> +       /* Align MTU of VF with master */
> +       ret = dev_set_mtu(vf_netdev, ndev->mtu);
> +       if (ret)
> +               netdev_warn(vf_netdev,
> +                           "unable to change mtu to %u\n", ndev->mtu);
> +
> +       call_netdevice_notifiers(NETDEV_JOIN, vf_netdev);
> +
> +       netdev_info(vf_netdev, "joined to %s\n", ndev->name);
> +       return 0;
> +
> +upper_link_failed:
> +       netdev_rx_handler_unregister(vf_netdev);
> +rx_handler_failed:
> +       return ret;
> +}
> +
> +static int virtnet_register_vf(struct net_device *vf_netdev)
> +{
> +       struct net_device *ndev;
> +       struct virtnet_info *vi;
> +
> +       if (vf_netdev->addr_len != ETH_ALEN)
> +               return NOTIFY_DONE;
> +
> +       /* We will use the MAC address to locate the virtio_net interface to
> +        * associate with the VF interface. If we don't find a matching
> +        * virtio interface, move on.
> +        */
> +       ndev = get_virtio_bymac(vf_netdev->perm_addr);
> +       if (!ndev)
> +               return NOTIFY_DONE;
> +
> +       vi = netdev_priv(ndev);
> +       if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_BACKUP))
> +               return NOTIFY_DONE;
> +
> +       if (rtnl_dereference(vi->vf_netdev))
> +               return NOTIFY_DONE;
> +
> +       if (virtnet_vf_join(vf_netdev, ndev) != 0)
> +               return NOTIFY_DONE;
> +
> +       netdev_info(ndev, "VF registering %s\n", vf_netdev->name);
> +
> +       dev_hold(vf_netdev);
> +       rcu_assign_pointer(vi->vf_netdev, vf_netdev);
> +
> +       return NOTIFY_OK;
> +}
> +
> +static int virtnet_unregister_vf(struct net_device *vf_netdev)
> +{
> +       struct net_device *ndev;
> +       struct virtnet_info *vi;
> +
> +       ndev = get_virtio_byref(vf_netdev);
> +       if (!ndev)
> +               return NOTIFY_DONE;
> +
> +       vi = netdev_priv(ndev);
> +       if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_BACKUP))
> +               return NOTIFY_DONE;
> +
> +       netdev_info(ndev, "VF unregistering %s\n", vf_netdev->name);
> +
> +       netdev_rx_handler_unregister(vf_netdev);
> +       netdev_upper_dev_unlink(vf_netdev, ndev);
> +       RCU_INIT_POINTER(vi->vf_netdev, NULL);
> +       dev_put(vf_netdev);
> +
> +       return NOTIFY_OK;
> +}
> +
>  static void virtnet_remove(struct virtio_device *vdev)
>  {
>         struct virtnet_info *vi = vdev->priv;
> +       struct net_device *vf_netdev;
>
>         virtnet_cpu_notif_remove(vi);
>
>         /* Make sure no work handler is accessing the device. */
>         flush_work(&vi->config_work);
>
> +       rtnl_lock();
> +       vf_netdev = rtnl_dereference(vi->vf_netdev);
> +       if (vf_netdev)
> +               virtnet_unregister_vf(vf_netdev);
> +       rtnl_unlock();
> +
>         unregister_netdev(vi->dev);
>
>         remove_vq_common(vi);
>
> +       free_percpu(vi->vf_stats);
>         free_percpu(vi->stats);
>         free_netdev(vi->dev);
>  }
> @@ -2859,6 +3123,42 @@ static struct virtio_driver virtio_net_driver = {
>  #endif
>  };
>
> +static int virtio_netdev_event(struct notifier_block *this,
> +                              unsigned long event, void *ptr)
> +{
> +       struct net_device *event_dev = netdev_notifier_info_to_dev(ptr);
> +
> +       /* Skip our own events */
> +       if (event_dev->netdev_ops == &virtnet_netdev)
> +               return NOTIFY_DONE;
> +
> +       /* Avoid non-Ethernet type devices */
> +       if (event_dev->type != ARPHRD_ETHER)
> +               return NOTIFY_DONE;
> +
> +       /* Avoid Vlan dev with same MAC registering as VF */
> +       if (is_vlan_dev(event_dev))
> +               return NOTIFY_DONE;
> +
> +       /* Avoid Bonding master dev with same MAC registering as VF */
> +       if ((event_dev->priv_flags & IFF_BONDING) &&
> +           (event_dev->flags & IFF_MASTER))
> +               return NOTIFY_DONE;
> +
> +       switch (event) {
> +       case NETDEV_REGISTER:
> +               return virtnet_register_vf(event_dev);
> +       case NETDEV_UNREGISTER:
> +               return virtnet_unregister_vf(event_dev);
> +       default:
> +               return NOTIFY_DONE;
> +       }
> +}
> +
> +static struct notifier_block virtio_netdev_notifier = {
> +       .notifier_call = virtio_netdev_event,
> +};
> +
>  static __init int virtio_net_driver_init(void)
>  {
>         int ret;
> @@ -2877,6 +3177,8 @@ static __init int virtio_net_driver_init(void)
>          ret = register_virtio_driver(&virtio_net_driver);
>         if (ret)
>                 goto err_virtio;
> +
> +       register_netdevice_notifier(&virtio_netdev_notifier);
>         return 0;
>  err_virtio:
>         cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD);
> @@ -2889,6 +3191,7 @@ module_init(virtio_net_driver_init);
>
>  static __exit void virtio_net_driver_exit(void)
>  {
> +       unregister_netdevice_notifier(&virtio_netdev_notifier);
>         unregister_virtio_driver(&virtio_net_driver);
>         cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD);
>         cpuhp_remove_multi_state(virtionet_online);
> --
> 2.14.3
>
Samudrala, Sridhar Jan. 22, 2018, 9:05 p.m. UTC | #2
On 1/22/2018 12:27 PM, Siwei Liu wrote:
> First off, as mentioned in another thread, the model of stacking up
> virt-bond functionality over virtio seems a wrong direction to me.
> Essentially the migration process would need to carry over all guest
> side configurations previously done on the VF/PT and get them moved to
> the new device being it virtio or VF/PT. Without the help of a new
> upper layer bond driver that enslaves virtio and VF/PT devices
> underneath, virtio will be overloaded with too much specifics being a
> VF/PT backup in the future. I hope you're already aware of the issue
> in longer term and move to that model as soon as possible. See more
> inline.

The idea behind this design is to provide a low latency datapath to
virtio_net while preserving the live migration feature, without the need
for the guest admin to configure a bond between the VF and virtio_net.
As this feature is enabled and configured via virtio_net, which has a
back channel to the hypervisor, adding this functionality to virtio_net
looks like a reasonable option.

Adding a new driver and a new device requires defining a new interface
and a channel between the hypervisor and the VM, and if required we may
implement that in the future.
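
For reference, the "back channel" here is just the VIRTIO_NET_F_BACKUP
feature bit added in patch 1/2 of this series; the VF-takeover paths in
this patch only run when that bit was negotiated. A minimal sketch
(virtnet_backup_enabled() is an illustrative helper, not part of the
patch):

static bool virtnet_backup_enabled(const struct virtnet_info *vi)
{
        /* true only if the hypervisor offered and the guest accepted
         * the backup feature, i.e. the hypervisor opted in
         */
        return virtio_has_feature(vi->vdev, VIRTIO_NET_F_BACKUP);
}

This is what virtnet_probe() and the netdev notifier handlers in the
patch check before allocating the per-cpu VF stats or enslaving a
same-MAC VF netdev.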

>
> On Thu, Jan 11, 2018 at 9:58 PM, Sridhar Samudrala
> <sridhar.samudrala@intel.com> wrote:
>> This patch enables virtio_net to switch over to a VF datapath when a VF
>> netdev is present with the same MAC address. The VF datapath is only used
>> for unicast traffic. Broadcasts/multicasts go via virtio datapath so that
>> east-west broadcasts don't use the PCI bandwidth.
> Why not making an this an option/optimization rather than being the
> only means? The problem of east-west broadcast eating PCI bandwidth
> depends on specifics of the (virtual) network setup, while some users
> won't want to lose VF's merits such as latency. Why restricting
> broadcast/multicast xmit to virtio only which potentially regresses
> the performance against raw VF?

I am planning to remove this option when I resubmit the patches.

>
>> It allows live migration
>> of a VM with a direct attached VF without the need to setup a bond/team
>> between a VF and virtio net device in the guest.
>>
>> The hypervisor needs to unplug the VF device from the guest on the source
>> host and reset the MAC filter of the VF to initiate failover of datapath to
>> virtio before starting the migration. After the migration is completed, the
>> destination hypervisor sets the MAC filter on the VF and plugs it back to
>> the guest to switch over to VF datapath.
> Is there a host side patch (planned) for this MAC filter switching
> process? As said in another thread, that simple script won't work for
> macvtap backend.

The host side patch to enable qemu to configure this feature is included
in this patch series.
I have been testing this feature using a shell script, but I hope someone
in the libvirt community will extend 'virsh' to handle live migration
when this feature is supported.

Thanks
Sridhar
Michael S. Tsirkin Jan. 22, 2018, 9:41 p.m. UTC | #3
On Mon, Jan 22, 2018 at 12:27:14PM -0800, Siwei Liu wrote:
> First off, as mentioned in another thread, the model of stacking up
> virt-bond functionality over virtio seems a wrong direction to me.
> Essentially the migration process would need to carry over all guest
> side configurations previously done on the VF/PT and get them moved to
> the new device being it virtio or VF/PT.

I might be wrong but I don't see why we should worry about this use case.
Whoever has a bond configured already has a working config for migration.
We are trying to help people who don't, not convert existing users.

> Without the help of a new
> upper layer bond driver that enslaves virtio and VF/PT devices
> underneath, virtio will be overloaded with too much specifics being a
> VF/PT backup in the future.

So this paragraph already includes at least two conflicting
proposals. On the one hand you want a separate device for
the virtual bond, on the other you are saying a separate
driver.

Further, the reason to have a separate *driver* was that
some people wanted to share code with netvsc - and that
one does not create a separate device, which you can't
change without breaking existing configs.

So some people want a fully userspace-configurable switchdev, and that
already exists at some level, and maybe it makes sense to add more
features for performance.

But the point was that some host configurations are very simple,
and it probably makes sense to pass this information to the guest
and have guest act on it directly. Let's not conflate the two.
Jason Wang Jan. 23, 2018, 10:33 a.m. UTC | #4
On Jan. 12, 2018, 13:58, Sridhar Samudrala wrote:
>   static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
>   {
>   	struct virtnet_info *vi = netdev_priv(dev);
>   	int qnum = skb_get_queue_mapping(skb);
>   	struct send_queue *sq = &vi->sq[qnum];
> +	struct net_device *vf_netdev;
>   	int err;
>   	struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum);
>   	bool kick = !skb->xmit_more;
>   	bool use_napi = sq->napi.weight;
>   
> +	/* If VF is present and up then redirect packets
> +	 * called with rcu_read_lock_bh
> +	 */
> +	vf_netdev = rcu_dereference_bh(vi->vf_netdev);
> +	if (vf_netdev && netif_running(vf_netdev) &&
> +	    !netpoll_tx_running(dev) &&
> +	    is_unicast_ether_addr(eth_hdr(skb)->h_dest))
> +		return virtnet_vf_xmit(dev, vf_netdev, skb);
> +

A question here.

If I read the code correctly, all features are validated against virtio
instead of the VF before transmitting. This assumes the VF's features are
a superset of virtio's; does this really work? Do we need to sanitize the
features before joining? (e.g. at least NETIF_F_GSO_ROBUST needs to be
removed).

Thanks
Samudrala, Sridhar Jan. 23, 2018, 4:03 p.m. UTC | #5
On 1/23/2018 2:33 AM, Jason Wang wrote:
>
>
On Jan. 12, 2018, 13:58, Sridhar Samudrala wrote:
>>   static netdev_tx_t start_xmit(struct sk_buff *skb, struct 
>> net_device *dev)
>>   {
>>       struct virtnet_info *vi = netdev_priv(dev);
>>       int qnum = skb_get_queue_mapping(skb);
>>       struct send_queue *sq = &vi->sq[qnum];
>> +    struct net_device *vf_netdev;
>>       int err;
>>       struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum);
>>       bool kick = !skb->xmit_more;
>>       bool use_napi = sq->napi.weight;
>>   +    /* If VF is present and up then redirect packets
>> +     * called with rcu_read_lock_bh
>> +     */
>> +    vf_netdev = rcu_dereference_bh(vi->vf_netdev);
>> +    if (vf_netdev && netif_running(vf_netdev) &&
>> +        !netpoll_tx_running(dev) &&
>> +        is_unicast_ether_addr(eth_hdr(skb)->h_dest))
>> +        return virtnet_vf_xmit(dev, vf_netdev, skb);
>> +
>
> A question here.
>
> If I read the code correctly, all features were validated against 
> virtio instead VF before transmitting. This assumes VF's feature is a 
> superset of virtio, does this really work? Do we need to sanitize the 
> feature before joining? (e.g at last NETIF_R_GSO_ROBUST needs to be 
> removed).

Actually, virtnet_vf_xmit() calls dev_queue_xmit() after updating
skb->dev to the VF netdev, so the features get validated against the
VF's features and the right tx queue is selected before the real
transmit.
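
To make that concrete, here is the hand-off condensed from
virtnet_vf_xmit() in the patch (virtnet_vf_xmit_sketch is just an
illustrative name and the stats handling is dropped; the comment about
validate_xmit_skb() describes generic core-networking behaviour, not
code added by this patch):

/* Once skb->dev points at the VF, dev_queue_xmit() selects the tx
 * queue and validates offload features (validate_xmit_skb() in
 * net/core/dev.c) against the VF netdev, not against virtio_net.
 */
static int virtnet_vf_xmit_sketch(struct net_device *dev,
                                  struct net_device *vf_netdev,
                                  struct sk_buff *skb)
{
        skb->dev = vf_netdev;
        skb->queue_mapping =
                qdisc_skb_cb(skb)->slave_dev_queue_mapping;

        return dev_queue_xmit(skb);
}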

Thanks
Sridhar
Laine Stump Jan. 23, 2018, 7:53 p.m. UTC | #6
On 01/22/2018 04:05 PM, Samudrala, Sridhar wrote:
> 
> I have been testing this feature using a shell script, but i hope
> someone in the libvirt
> community  will extend 'virsh' to handle live migration when this
> feature is supported.

Each virsh command connects to a single instance of libvirtd on one host
(even the virsh migrate command puts the info about the destination host
into the parameters for a command that it sends to libvirtd on the
source host - virsh itself doesn't directly connect to the destination
host), so the idea of doing all three operations:

 1) unplug assigned device from guest on host A
 2) migrate guest to host B
 3) hotplug assigned device to guest on host B

as a single command is out of scope for virsh. Of course management
software that is designed to manage a *cluster* of hosts (e.g. oVirt,
OpenStack) is connected to libvirtd on both the source and destination
hosts, so it's more reasonable to expect it to support this
functionality with a single command (or button press, or whatever).

The place where libvirt can and should help is in exposing any new flags
(or other related config) that enable the virtio features in libvirt's
domain configuration XML. For that, don't hesitate to send email to
libvir-list@redhat.com, or ask questions in the #virt channel on OFTC.
Cc me on any mail so I'll see it right away.
Siwei Liu Jan. 23, 2018, 8:24 p.m. UTC | #7
On Mon, Jan 22, 2018 at 1:41 PM, Michael S. Tsirkin <mst@redhat.com> wrote:
> On Mon, Jan 22, 2018 at 12:27:14PM -0800, Siwei Liu wrote:
>> First off, as mentioned in another thread, the model of stacking up
>> virt-bond functionality over virtio seems a wrong direction to me.
>> Essentially the migration process would need to carry over all guest
>> side configurations previously done on the VF/PT and get them moved to
>> the new device being it virtio or VF/PT.
>
> I might be wrong but I don't see why we should worry about this usecase.
> Whoever has a bond configured already has working config for migration.
> We are trying to help people who don't, not convert existig users.

That has been put forward from the view of cloud providers: the imported
images from the store must be able to run unmodified, thus no additional
setup script is allowed (just as Stephen mentioned in another mail).
Cloud users don't care about live migration themselves, but the providers
are required to implement such an automation mechanism to make this
process transparent if at all possible. The user does not care about the
device underneath being a VF or not, but they do care about consistency
all across and the resulting performance acceleration in making the VF
the preferred datapath. These are not particularly peculiar user cases;
IMHO *any* approach proposed for live migration should be able to persist
the state including network config, e.g. as simple as the MTU. Actually
this requirement has nothing to do with virtio: our target users are live
migration agnostic, be it tracking DMA through dirty pages, using virtio
as the helper, or whatsoever, the goal of persisting configs across the
move remains the same.

>
>> Without the help of a new
>> upper layer bond driver that enslaves virtio and VF/PT devices
>> underneath, virtio will be overloaded with too much specifics being a
>> VF/PT backup in the future.
>
> So this paragraph already includes at least two conflicting
> proposals. On the one hand you want a separate device for
> the virtual bond, on the other you are saying a separate
> driver.

Just to be crystal clear: separate virtual bond device (netdev ops,
not necessarily bus device) for VM migration specifically with a
separate driver.

>
> Further, the reason to have a separate *driver* was that
> some people wanted to share code with netvsc - and that
> one does not create a separate device, which you can't
> change without breaking existing configs.

I'm not sure I understand this statement. netvsc already creates another
netdev on top of the enslaved VF netdev, so why does it matter? In
the Azure case, the stock image to be imported does not bind to a
specific driver but only to a MAC address. And people just deal with the
new virt-bond netdev rather than the underlying virtio and VF. And
both these two underlying netdevs should be made invisible to prevent
userspace scripts from getting them misconfigured IMHO.

A separate driver was for code sharing, for sure, not just with netvsc
but with other para-virtual devices floating around: any PV device can
serve as the side channel and the backup path for the VF/PT. Once we get
the new driver working atop virtio we may define the ops and/or protocol
needed to talk to the various other PV frontends that may implement a
side channel of their own for datapath switching (e.g. virtio is one of
them, a Xen PV frontend can be another). I just don't want to limit the
function to virtio only; otherwise we have to duplicate code and it
starts to scatter all over the place.

I understand that right now we start simple, so it may just be fine
that the initial development activities center around virtio. However,
from a cloud provider/vendor perspective I don't see the proposed scheme
limited to virtio only. Any other PV driver which plans to
support the same scheme can benefit. The point is that we shouldn't be
tying the scheme to virtio specifics so early, which makes it hard to
promote it to a common driver once we get there.

>
> So some people want a fully userspace-configurable switchdev, and that
> already exists at some level, and maybe it makes sense to add more
> features for performance.
>
> But the point was that some host configurations are very simple,
> and it probably makes sense to pass this information to the guest
> and have guest act on it directly. Let's not conflate the two.

It may be fine to push some of the configuration from the host, but that
perhaps doesn't cover all the cases: how is it possible for the host
to save all the network state and configs done by the guest before
migration? Some of the configs might come from a future guest which is
unknown to the host. Anyhow, the bottom line is that the guest must be
able to act on those configuration changes automatically without
user intervention.

Regards,
-Siwei

>
> --
> MST
Michael S. Tsirkin Jan. 23, 2018, 10:58 p.m. UTC | #8
On Tue, Jan 23, 2018 at 12:24:47PM -0800, Siwei Liu wrote:
> On Mon, Jan 22, 2018 at 1:41 PM, Michael S. Tsirkin <mst@redhat.com> wrote:
> > On Mon, Jan 22, 2018 at 12:27:14PM -0800, Siwei Liu wrote:
> >> First off, as mentioned in another thread, the model of stacking up
> >> virt-bond functionality over virtio seems a wrong direction to me.
> >> Essentially the migration process would need to carry over all guest
> >> side configurations previously done on the VF/PT and get them moved to
> >> the new device being it virtio or VF/PT.
> >
> > I might be wrong but I don't see why we should worry about this usecase.
> > Whoever has a bond configured already has working config for migration.
> > We are trying to help people who don't, not convert existig users.
> 
> That has been placed in the view of cloud providers that the imported
> images from the store must be able to run unmodified thus no
> additional setup script is allowed (just as Stephen mentioned in
> another mail). Cloud users don't care about live migration themselves
> but the providers are required to implement such automation mechanism
> to make this process transparent if at all possible. The user does not
> care about the device underneath being VF or not, but they do care
> about consistency all across and the resulting performance
> acceleration in making VF the prefered datapath. It is not quite
> peculiar user cases but IMHO *any* approach proposed for live
> migration should be able to persist the state including network config
> e.g. as simple as MTU. Actually this requirement has nothing to do
> with virtio but our target users are live migration agnostic, being it
> tracking DMA through dirty pages, using virtio as the helper, or
> whatsoever, the goal of persisting configs across remains same.

So the patch being discussed here will mostly do exactly that if your
original config was simply a single virtio net device.


What kind of configs do your users have right now?


> >
> >> Without the help of a new
> >> upper layer bond driver that enslaves virtio and VF/PT devices
> >> underneath, virtio will be overloaded with too much specifics being a
> >> VF/PT backup in the future.
> >
> > So this paragraph already includes at least two conflicting
> > proposals. On the one hand you want a separate device for
> > the virtual bond, on the other you are saying a separate
> > driver.
> 
> Just to be crystal clear: separate virtual bond device (netdev ops,
> not necessarily bus device) for VM migration specifically with a
> separate driver.

Okay, but note that any config someone had on a virtio device won't
propagate to that bond.

> >
> > Further, the reason to have a separate *driver* was that
> > some people wanted to share code with netvsc - and that
> > one does not create a separate device, which you can't
> > change without breaking existing configs.
> 
> I'm not sure I understand this statement. netvsc is already another
> netdev being created than the enslaved VF netdev, why it bothers?

Because it shipped, the userspace ABI is frozen.  You can't really add a
netdevice and enslave an existing one without a risk of breaking some
userspace configs.


> In
> the Azure case, the stock image to be imported does not bind to a
> specific driver but only MAC address.

I'll let netvsc developers decide this; on the surface I don't think
it's reasonable to assume everyone only binds to a MAC.


> And people just deal with the
> new virt-bond netdev rather than the underlying virtio and VF. And
> both these two underlying netdevs should be made invisible to prevent
> userspace script from getting them misconfigured IMHO.
> 
> A separate driver was for code sharing for sure, only just netvsc but
> could be other para-virtual devices floating around: any PV can serve
> as the side channel and the backup path for VF/PT. Once we get the new
> driver working atop virtio we may define ops and/or protocol needed to
> talk to various other PV frontend that may implement the side channel
> of its own for datapath switching (e.g. virtio is one of them, Xen PV
> frontend can be another). I just don't like to limit the function to
> virtio only and we have to duplicate code then it starts to scatter
> around all over the places.
> 
> I understand right now we start it as simple so it may just be fine
> that the initial development activities center around virtio. However,
> from cloud provider/vendor perspective I don't see the proposed scheme
> limits to virtio only. Any other PV driver which has the plan to
> support the same scheme can benefit. The point is that we shouldn't be
> limiting the scheme to virtio specifics so early which is hard to have
> it promoted to a common driver once we get there.

The whole idea has been floating around for years. It would always
get drowned in this kind of "let's try to cover all use cases"
discussion, and never make progress.
So let's see some working code merged. If it works fine for virtio
and turns out to be a good fit for netvsc, we can share code.


> >
> > So some people want a fully userspace-configurable switchdev, and that
> > already exists at some level, and maybe it makes sense to add more
> > features for performance.
> >
> > But the point was that some host configurations are very simple,
> > and it probably makes sense to pass this information to the guest
> > and have guest act on it directly. Let's not conflate the two.
> 
> It may be fine to push some of the configurations from host but that
> perhaps doesn't cover all the cases: how is it possible for the host
> to save all network states and configs done by the guest before
> migration. Some of the configs might come from future guest which is
> unknown to host. Anyhow the bottom line is that the guest must be able
> to act on those configuration request changes automatically without
> involving users intervention.
> 
> Regards,
> -Siwei

All use cases are *already* covered by existing kernel APIs.  Just use a
bond, or a bridge, or whatever. It's just that they are so generic and
hard to use that userspace to drive them never surfaced.

So I am interested in some code that handles some simple use-cases
in the kernel, with a simple kernel API.

> >
> > --
> > MST
Siwei Liu Jan. 26, 2018, 8:14 a.m. UTC | #9
On Tue, Jan 23, 2018 at 2:58 PM, Michael S. Tsirkin <mst@redhat.com> wrote:
> On Tue, Jan 23, 2018 at 12:24:47PM -0800, Siwei Liu wrote:
>> On Mon, Jan 22, 2018 at 1:41 PM, Michael S. Tsirkin <mst@redhat.com> wrote:
>> > On Mon, Jan 22, 2018 at 12:27:14PM -0800, Siwei Liu wrote:
>> >> First off, as mentioned in another thread, the model of stacking up
>> >> virt-bond functionality over virtio seems a wrong direction to me.
>> >> Essentially the migration process would need to carry over all guest
>> >> side configurations previously done on the VF/PT and get them moved to
>> >> the new device being it virtio or VF/PT.
>> >
>> > I might be wrong but I don't see why we should worry about this usecase.
>> > Whoever has a bond configured already has working config for migration.
>> > We are trying to help people who don't, not convert existig users.
>>
>> That has been placed in the view of cloud providers that the imported
>> images from the store must be able to run unmodified thus no
>> additional setup script is allowed (just as Stephen mentioned in
>> another mail). Cloud users don't care about live migration themselves
>> but the providers are required to implement such automation mechanism
>> to make this process transparent if at all possible. The user does not
>> care about the device underneath being VF or not, but they do care
>> about consistency all across and the resulting performance
>> acceleration in making VF the prefered datapath. It is not quite
>> peculiar user cases but IMHO *any* approach proposed for live
>> migration should be able to persist the state including network config
>> e.g. as simple as MTU. Actually this requirement has nothing to do
>> with virtio but our target users are live migration agnostic, being it
>> tracking DMA through dirty pages, using virtio as the helper, or
>> whatsoever, the goal of persisting configs across remains same.
>
> So the patching being discussed here will mostly do exactly that if your
> original config was simply a single virtio net device.
>

True, but I don't see that the patch being discussed starts with a good
foundation for supporting the same for VF/PT devices. That is the core
of the issue.

>
> What kind of configs do your users have right now?

Any configs, be they generic or driver specific, that the VF/PT device
supports and that have been enabled/configured. General network configs
(MAC, IP address, VLAN, MTU, iptables rules), ethtool settings
(hardware offload, # of queues and ring entries, RSC options, rss
rxfh-indir table, rx-flow-hash, et al.), bpf/XDP programs being run, tc
flower offload, just to name a few. As cloud providers we don't limit
users from applying driver specific tuning to the NIC/VF, and
sometimes this is essential to achieving the best performance for their
workload. We've seen cases like tuning coalescing parameters for
getting low latency, changing the rx-flow-hash function for better VXLAN
throughput, or even adopting quite advanced NIC features such as flow
director or cloud filter. We don't expect users to compromise even a
little bit on these. That is, once we turn on live migration for the VF
or pass-through devices in the VM, it all takes place under the hood;
users (guest admins, applications) don't have to react to it or even
notice the change. I should note that the majority of live migrations
take place between machines with completely identical hardware, which
makes it all the more critical to keep the config as-is across the move,
quietly and transparently.

As you see, a generic bond or bridge cannot satisfy the need. That's why
we need a new customized virt-bond driver, tailored for VM live
migration specifically. Leveraging a para-virtual device, e.g. a virtio
net device, as the backup path is one thing; tracking driver config
changes in order to re-configure as necessary is another. I would think
that without considering the latter, the proposal being discussed is
rather incomplete, and far from useful in production.

>
>
>> >
>> >> Without the help of a new
>> >> upper layer bond driver that enslaves virtio and VF/PT devices
>> >> underneath, virtio will be overloaded with too much specifics being a
>> >> VF/PT backup in the future.
>> >
>> > So this paragraph already includes at least two conflicting
>> > proposals. On the one hand you want a separate device for
>> > the virtual bond, on the other you are saying a separate
>> > driver.
>>
>> Just to be crystal clear: separate virtual bond device (netdev ops,
>> not necessarily bus device) for VM migration specifically with a
>> separate driver.
>
> Okay, but note that any config someone had on a virtio device won't
> propagate to that bond.
>
>> >
>> > Further, the reason to have a separate *driver* was that
>> > some people wanted to share code with netvsc - and that
>> > one does not create a separate device, which you can't
>> > change without breaking existing configs.
>>
>> I'm not sure I understand this statement. netvsc is already another
>> netdev being created than the enslaved VF netdev, why it bothers?
>
> Because it shipped, so userspace ABI is frozen.  You can't really add a
> netdevice and enslave an existing one without a risk of breaking some
> userspace configs.
>

I still don't understand this concern. As said, before this patch
becomes reality, users interact with the raw VF interface all the time.
Now this patch introduces a virtio net device and enslaves the VF.
Users have to interact with two interfaces - IP address and friends
configured on the VF will get lost and users have to reconfigure
virtio all over again. But some other configs, e.g. ethtool, need to
remain on the VF. How does it guarantee existing configs won't be broken?
It appears to me this is no different from having both the virtio and VF
netdevs enslaved and users operating on the virt-bond interface
directly.

One thing I'd like to point out is that the configs are mostly done in
the control plane. It's entirely possible to separate the data and
control paths in the new virt-bond driver: in the data plane, it may
bypass the virt-bond layer and quickly fall through to the data path of
the virtio or VF slave; while in the control plane, the virt-bond may
disguise itself as the active slave, delegate the config changes to
the real driver, and relay and expose driver config/state to the user.
By doing that, users and userspace applications just interact with one
single interface, the same way they interacted with the VF interface
before. Users don't have to deal with the other two enslaved
interfaces directly - those automatically enslaved devices should be
made invisible to userspace applications and admins, and/or be
masked out from regular access by existing kernel APIs.
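
A minimal sketch of that delegation, assuming a hypothetical virt-bond
driver (the struct and all virtbond_* names below are illustrative, not
from this patch series): the control-plane ndo op relays the request to
whichever slave is currently active, so the guest keeps configuring a
single interface.

#include <linux/netdevice.h>
#include <linux/rtnetlink.h>

struct virtbond {
        struct net_device __rcu *active_slave;
};

/* Relay an MTU change to the active slave (VF or virtio) so config
 * done on the bond follows the real datapath device; called under RTNL
 * like any ndo_change_mtu implementation.
 */
static int virtbond_change_mtu(struct net_device *bond_dev, int new_mtu)
{
        struct virtbond *vb = netdev_priv(bond_dev);
        struct net_device *slave = rtnl_dereference(vb->active_slave);
        int err;

        if (slave) {
                err = dev_set_mtu(slave, new_mtu);
                if (err)
                        return err;
        }

        bond_dev->mtu = new_mtu;
        return 0;
}

The data plane would take the opposite shortcut, picking the active
slave directly in ndo_start_xmit much like start_xmit() does in this
patch.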

I don't find it a good reason to reject the idea if we can sort out
ways not to break existing ABI or APIs.


>
>> In
>> the Azure case, the stock image to be imported does not bind to a
>> specific driver but only MAC address.
>
> I'll let netvsc developers decide this, on the surface I don't think
> it's reasonable to assume everyone only binds to a MAC.

Sure. The point I wanted to make was that cloud providers are super
elastic in provisioning images - those driver or device specifics
should have been dehydrated from the original images, thus making them
flexible enough to deploy to machines with a vast variety of hardware.
Although it's not necessarily the case that everyone binds to a MAC,
it's worth taking a look at what the target users are doing and what the
pain points really are, and understanding what could be done to solve
the core problems. I believe Hyper-V netvsc can also benefit once moved
to it.

>
>
>> And people just deal with the
>> new virt-bond netdev rather than the underlying virtio and VF. And
>> both these two underlying netdevs should be made invisible to prevent
>> userspace script from getting them misconfigured IMHO.
>>
>> A separate driver was for code sharing for sure, only just netvsc but
>> could be other para-virtual devices floating around: any PV can serve
>> as the side channel and the backup path for VF/PT. Once we get the new
>> driver working atop virtio we may define ops and/or protocol needed to
>> talk to various other PV frontend that may implement the side channel
>> of its own for datapath switching (e.g. virtio is one of them, Xen PV
>> frontend can be another). I just don't like to limit the function to
>> virtio only and we have to duplicate code then it starts to scatter
>> around all over the places.
>>
>> I understand right now we start it as simple so it may just be fine
>> that the initial development activities center around virtio. However,
>> from cloud provider/vendor perspective I don't see the proposed scheme
>> limits to virtio only. Any other PV driver which has the plan to
>> support the same scheme can benefit. The point is that we shouldn't be
>> limiting the scheme to virtio specifics so early which is hard to have
>> it promoted to a common driver once we get there.
>
> The whole idea has been floating around for years. It would always
> get being drowned in this kind of "lets try to cover all use-cases"
> discussions, and never make progress.
> So let's see some working code merged. If it works fine for virtio
> and turns out to be a good fit for netvsc, we can share code.

I think we should at least start with a separate netdev other than
virtio. That is what we may agree we have to do without compromise, I'd
hope.

>
>
>> >
>> > So some people want a fully userspace-configurable switchdev, and that
>> > already exists at some level, and maybe it makes sense to add more
>> > features for performance.
>> >
>> > But the point was that some host configurations are very simple,
>> > and it probably makes sense to pass this information to the guest
>> > and have guest act on it directly. Let's not conflate the two.
>>
>> It may be fine to push some of the configurations from host but that
>> perhaps doesn't cover all the cases: how is it possible for the host
>> to save all network states and configs done by the guest before
>> migration. Some of the configs might come from future guest which is
>> unknown to host. Anyhow the bottom line is that the guest must be able
>> to act on those configuration request changes automatically without
>> involving users intervention.
>>
>> Regards,
>> -Siwei
>
> All use-cases are *already* covered by existing kernel APIs.  Just use a
> bond, or a bridge, or whatever. It's just that they are so generic and
> hard to use, that userspace to do it never surfaced.

As mentioned earlier, and I cannot stress this enough, the existing
generic bond or bridge doesn't work. We need a new net device that
works for live migration specifically and fits it well.

>
> So I am interested in some code that handles some simple use-cases
> in the kernel, with a simple kernel API.

It should be fine; I like simple stuff too and wouldn't like to add
complications. The concept of hiding auto-managed interfaces is not
new and has even been implemented by other operating systems already.
Not sure if that is your compatibility concern. We start simple
for sure, but simple != unexpandable, which would make it impossible
for potential users to use at all.


Thanks,
-Siwei

>
>> >
>> > --
>> > MST
Samudrala, Sridhar Jan. 26, 2018, 4:51 p.m. UTC | #10
On 1/26/2018 12:14 AM, Siwei Liu wrote:
> On Tue, Jan 23, 2018 at 2:58 PM, Michael S. Tsirkin <mst@redhat.com> wrote:
>> On Tue, Jan 23, 2018 at 12:24:47PM -0800, Siwei Liu wrote:
>>> On Mon, Jan 22, 2018 at 1:41 PM, Michael S. Tsirkin <mst@redhat.com> wrote:
>>>> On Mon, Jan 22, 2018 at 12:27:14PM -0800, Siwei Liu wrote:
>>>>> First off, as mentioned in another thread, the model of stacking up
>>>>> virt-bond functionality over virtio seems a wrong direction to me.
>>>>> Essentially the migration process would need to carry over all guest
>>>>> side configurations previously done on the VF/PT and get them moved to
>>>>> the new device being it virtio or VF/PT.
>>>> I might be wrong but I don't see why we should worry about this usecase.
>>>> Whoever has a bond configured already has working config for migration.
>>>> We are trying to help people who don't, not convert existig users.
>>> That has been placed in the view of cloud providers that the imported
>>> images from the store must be able to run unmodified thus no
>>> additional setup script is allowed (just as Stephen mentioned in
>>> another mail). Cloud users don't care about live migration themselves
>>> but the providers are required to implement such automation mechanism
>>> to make this process transparent if at all possible. The user does not
>>> care about the device underneath being VF or not, but they do care
>>> about consistency all across and the resulting performance
>>> acceleration in making VF the prefered datapath. It is not quite
>>> peculiar user cases but IMHO *any* approach proposed for live
>>> migration should be able to persist the state including network config
>>> e.g. as simple as MTU. Actually this requirement has nothing to do
>>> with virtio but our target users are live migration agnostic, being it
>>> tracking DMA through dirty pages, using virtio as the helper, or
>>> whatsoever, the goal of persisting configs across remains same.
>> So the patching being discussed here will mostly do exactly that if your
>> original config was simply a single virtio net device.
>>
> True, but I don't see the patch being discussed starts with good
> foundation of supporting the same for VF/PT device. That is the core
> of the issue.

>> What kind of configs do your users have right now?
> Any configs be it generic or driver specific that the VF/PT device
> supports and have been enabled/configured. General network configs
> (MAC, IP address, VLAN, MTU, iptables rules), ethtool settings
> (hardware offload, # of queues and ring entris, RSC options, rss
> rxfh-indir table, rx-flow-hash, et al) , bpf/XDP program being run, tc
> flower offload, just to name a few. As cloud providers we don't limit
> users from applying driver specific tuning to the NIC/VF, and
> sometimes this is essential to achieving best performance for their
> workload. We've seen cases like tuning coalescing parameters for
> getting low latency, changing rx-flow-hash function for better VXLAN
> throughput, or even adopting quite advanced NIC features such as flow
> director or cloud filter. We don't expect users to compromise even a
> little bit on these. That is once we turn on live migration for the VF
> or pass through devices in the VM, it all takes place under the hood,
> users (guest admins, applications) don't have to react upon it or even
> notice the change. I should note that the majority of live migrations
> take place between machines with completely identical hardware, it's
> more critical than necessary to keep the config as-is across the move,
> stealth while quiet.

This use case is much more complicated and different from what this
patch is trying to address. Also, your use case seems to assume that the
source and destination hosts are identical and have the same HW.

It is pretty hard to transparently migrate all these settings with
live migration when we are looking at a solution that unplugs the VF
interface from the host and the VF driver is unloaded.

> As you see generic bond or bridge cannot suffice the need. That's why
> we need a new customized virt bond driver, and tailor it for VM live
> migration specifically. Leveraging para-virtual e.g. virtio net device
> as the backup path is one thing, tracking through driver config
> changes in order to re-config as necessary is another. I would think
> without considering the latter, the proposal being discussed is rather
> incomplete, and remote to be useful in production.
>
>>
>>>>> Without the help of a new
>>>>> upper layer bond driver that enslaves virtio and VF/PT devices
>>>>> underneath, virtio will be overloaded with too much specifics being a
>>>>> VF/PT backup in the future.
>>>> So this paragraph already includes at least two conflicting
>>>> proposals. On the one hand you want a separate device for
>>>> the virtual bond, on the other you are saying a separate
>>>> driver.
>>> Just to be crystal clear: separate virtual bond device (netdev ops,
>>> not necessarily bus device) for VM migration specifically with a
>>> separate driver.
>> Okay, but note that any config someone had on a virtio device won't
>> propagate to that bond.
>>
>>>> Further, the reason to have a separate *driver* was that
>>>> some people wanted to share code with netvsc - and that
>>>> one does not create a separate device, which you can't
>>>> change without breaking existing configs.
>>> I'm not sure I understand this statement. netvsc is already another
>>> netdev being created than the enslaved VF netdev, why it bothers?
>> Because it shipped, so userspace ABI is frozen.  You can't really add a
>> netdevice and enslave an existing one without a risk of breaking some
>> userspace configs.
>>
> I still don't understand this concern. Like said, before this patch
> becomes reality, users interact with raw VF interface all the time.
> Now this patch introduces a virtio net devive and enslave the VF.
> Users have to interact with two interfaces - IP address and friends
> configured on the VF will get lost and users have to reconfigure
> virtio all over again. But some other configs e.g. ethtool needs to
> remain on the VF. How does it guarantee existing configs won't broken?
> Appears to me this is nothing different than having both virtio and VF
> netdevs enslaved and users operates on the virt-bond interface
> directly.

Yes. This patch doesn't transparently provide live migration support to
existing network configurations that only use a VF. In order to get live
migration support for a VF-only image, the network configuration has to
change.

It provides hypervisor-controlled VF acceleration to existing
virtio_net-based network configurations in a transparent manner.

>
> One thing I'd like to point out is the configs are mostly done in the
> control plane. It's entirely possible to separate the data and control
> paths in the new virt-bond driver: in the data plane, it may bypass
> the virt-bond layer and quickly fall through to the data path of
> virtio or VF slave; while in the control plane, the virt-bond may
> disguise itself as the active slave, delegate the config changes to
> the real driver, relay and expose driver config/state to the user. By
> doing that the users and userspace applications just interact with one
> single interface, the same way they interacted with the VF interface
> as before. Users don't have to deal with the other two enslaved
> interfaces directly - those automatically enslaved devices should be
> made invisible from userspace applications and admins, and/or be
> masked out from regular access by existing kernel APIs.
>
> I don't find it a good reason to reject the idea if we can sort out
> ways not to break existing ABI or APIs.
>
>
>>> In
>>> the Azure case, the stock image to be imported does not bind to a
>>> specific driver but only MAC address.
>> I'll let netvsc developers decide this, on the surface I don't think
>> it's reasonable to assume everyone only binds to a MAC.
> Sure. The point I wanted to make was that cloud providers are super
> elastic in provisioning images - those driver or device specifics
> should have been dehydrated from the original images thus make it
> flexible enough to deploy to machines with vast varieties of hardware.
> Although it's not necessarily the case everyone binds to a MAC, it's
> worth taking a look at what the target users are doing and what the
> pain points really are and understand what could be done to solve core
> problems. Hyper-V netvsc can also benefit once moved to it, I'd
> believe.
>
>>
>>> And people just deal with the
>>> new virt-bond netdev rather than the underlying virtio and VF. And
>>> both these two underlying netdevs should be made invisible to prevent
>>> userspace script from getting them misconfigured IMHO.
>>>
>>> A separate driver was for code sharing for sure, only just netvsc but
>>> could be other para-virtual devices floating around: any PV can serve
>>> as the side channel and the backup path for VF/PT. Once we get the new
>>> driver working atop virtio we may define ops and/or protocol needed to
>>> talk to various other PV frontend that may implement the side channel
>>> of its own for datapath switching (e.g. virtio is one of them, Xen PV
>>> frontend can be another). I just don't like to limit the function to
>>> virtio only and we have to duplicate code then it starts to scatter
>>> around all over the places.
>>>
>>> I understand right now we start it as simple so it may just be fine
>>> that the initial development activities center around virtio. However,
>>> from cloud provider/vendor perspective I don't see the proposed scheme
>>> limits to virtio only. Any other PV driver which has the plan to
>>> support the same scheme can benefit. The point is that we shouldn't be
>>> limiting the scheme to virtio specifics so early which is hard to have
>>> it promoted to a common driver once we get there.
>> The whole idea has been floating around for years. It would always
>> get being drowned in this kind of "lets try to cover all use-cases"
>> discussions, and never make progress.
>> So let's see some working code merged. If it works fine for virtio
>> and turns out to be a good fit for netvsc, we can share code.
> I think we at least should start with a separate netdev other than
> virtio. That is what we may agree to have to do without comprise I'd
> hope.
I think the use case that you are targeting does require a new para-virt
bond driver and a new type of netdevice.

For the use case where the host is doing all the guest network
tuning/optimizations and the VM is not expected to do any
tuning/optimizations on the VF driver directly, I think the current patch
that follows the netvsc model of 2 netdevs (virtio and VF) should work
fine.
>>
>>>> So some people want a fully userspace-configurable switchdev, and that
>>>> already exists at some level, and maybe it makes sense to add more
>>>> features for performance.
>>>>
>>>> But the point was that some host configurations are very simple,
>>>> and it probably makes sense to pass this information to the guest
>>>> and have guest act on it directly. Let's not conflate the two.
>>> It may be fine to push some of the configurations from host but that
>>> perhaps doesn't cover all the cases: how is it possible for the host
>>> to save all network states and configs done by the guest before
>>> migration. Some of the configs might come from future guest which is
>>> unknown to host. Anyhow the bottom line is that the guest must be able
>>> to act on those configuration request changes automatically without
>>> involving users intervention.
>>>
>>> Regards,
>>> -Siwei
>> All use-cases are *already* covered by existing kernel APIs.  Just use a
>> bond, or a bridge, or whatever. It's just that they are so generic and
>> hard to use, that userspace to do it never surfaced.
> As mentioned earlier, for which I cannot stress enough, the existing
> generic bond or bridge doesn't work. We need a new net device that
> works for live migration specifically and fits it well.
>
>> So I am interested in some code that handles some simple use-cases
>> in the kernel, with a simple kernel API.
> It should be fine, I like simple stuffs too and wouldn't like to make
> complications. The concept of hiding auto-managed interfaces is not
> new and has even been implemented by other operating systems already.
> Not sure if that is your compatibility concern. We start with simple
> for sure, but simple != in-expandable then make potential users
> impossible to use at all.
>
>
> Thanks,
> -Siwei
>
>>>> --
>>>> MST
Michael S. Tsirkin Jan. 26, 2018, 4:58 p.m. UTC | #11
On Thu, Jan 11, 2018 at 09:58:39PM -0800, Sridhar Samudrala wrote:
> @@ -2859,6 +3123,42 @@ static struct virtio_driver virtio_net_driver = {
>  #endif
>  };
>  
> +static int virtio_netdev_event(struct notifier_block *this,
> +			       unsigned long event, void *ptr)
> +{
> +	struct net_device *event_dev = netdev_notifier_info_to_dev(ptr);
> +
> +	/* Skip our own events */
> +	if (event_dev->netdev_ops == &virtnet_netdev)
> +		return NOTIFY_DONE;
> +
> +	/* Avoid non-Ethernet type devices */
> +	if (event_dev->type != ARPHRD_ETHER)
> +		return NOTIFY_DONE;
> +
> +	/* Avoid Vlan dev with same MAC registering as VF */
> +	if (is_vlan_dev(event_dev))
> +		return NOTIFY_DONE;
> +
> +	/* Avoid Bonding master dev with same MAC registering as VF */
> +	if ((event_dev->priv_flags & IFF_BONDING) &&
> +	    (event_dev->flags & IFF_MASTER))
> +		return NOTIFY_DONE;
> +
> +	switch (event) {
> +	case NETDEV_REGISTER:
> +		return virtnet_register_vf(event_dev);
> +	case NETDEV_UNREGISTER:
> +		return virtnet_unregister_vf(event_dev);
> +	default:
> +		return NOTIFY_DONE;
> +	}
> +}
> +
> +static struct notifier_block virtio_netdev_notifier = {
> +	.notifier_call = virtio_netdev_event,
> +};
> +
>  static __init int virtio_net_driver_init(void)
>  {
>  	int ret;
> @@ -2877,6 +3177,8 @@ static __init int virtio_net_driver_init(void)
>          ret = register_virtio_driver(&virtio_net_driver);
>  	if (ret)
>  		goto err_virtio;
> +
> +	register_netdevice_notifier(&virtio_netdev_notifier);
>  	return 0;
>  err_virtio:
>  	cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD);
> @@ -2889,6 +3191,7 @@ module_init(virtio_net_driver_init);
>  
>  static __exit void virtio_net_driver_exit(void)
>  {
> +	unregister_netdevice_notifier(&virtio_netdev_notifier);
>  	unregister_virtio_driver(&virtio_net_driver);
>  	cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD);
>  	cpuhp_remove_multi_state(virtionet_online);

I have a question here: what if the PT device driver module loads
and creates a device before virtio?


> -- 
> 2.14.3
Samudrala, Sridhar Jan. 26, 2018, 6:15 p.m. UTC | #12
On 1/26/2018 8:58 AM, Michael S. Tsirkin wrote:
> On Thu, Jan 11, 2018 at 09:58:39PM -0800, Sridhar Samudrala wrote:
>> @@ -2859,6 +3123,42 @@ static struct virtio_driver virtio_net_driver = {
>>   #endif
>>   };
>>   
>> +static int virtio_netdev_event(struct notifier_block *this,
>> +			       unsigned long event, void *ptr)
>> +{
>> +	struct net_device *event_dev = netdev_notifier_info_to_dev(ptr);
>> +
>> +	/* Skip our own events */
>> +	if (event_dev->netdev_ops == &virtnet_netdev)
>> +		return NOTIFY_DONE;
>> +
>> +	/* Avoid non-Ethernet type devices */
>> +	if (event_dev->type != ARPHRD_ETHER)
>> +		return NOTIFY_DONE;
>> +
>> +	/* Avoid Vlan dev with same MAC registering as VF */
>> +	if (is_vlan_dev(event_dev))
>> +		return NOTIFY_DONE;
>> +
>> +	/* Avoid Bonding master dev with same MAC registering as VF */
>> +	if ((event_dev->priv_flags & IFF_BONDING) &&
>> +	    (event_dev->flags & IFF_MASTER))
>> +		return NOTIFY_DONE;
>> +
>> +	switch (event) {
>> +	case NETDEV_REGISTER:
>> +		return virtnet_register_vf(event_dev);
>> +	case NETDEV_UNREGISTER:
>> +		return virtnet_unregister_vf(event_dev);
>> +	default:
>> +		return NOTIFY_DONE;
>> +	}
>> +}
>> +
>> +static struct notifier_block virtio_netdev_notifier = {
>> +	.notifier_call = virtio_netdev_event,
>> +};
>> +
>>   static __init int virtio_net_driver_init(void)
>>   {
>>   	int ret;
>> @@ -2877,6 +3177,8 @@ static __init int virtio_net_driver_init(void)
>>           ret = register_virtio_driver(&virtio_net_driver);
>>   	if (ret)
>>   		goto err_virtio;
>> +
>> +	register_netdevice_notifier(&virtio_netdev_notifier);
>>   	return 0;
>>   err_virtio:
>>   	cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD);
>> @@ -2889,6 +3191,7 @@ module_init(virtio_net_driver_init);
>>   
>>   static __exit void virtio_net_driver_exit(void)
>>   {
>> +	unregister_netdevice_notifier(&virtio_netdev_notifier);
>>   	unregister_virtio_driver(&virtio_net_driver);
>>   	cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD);
>>   	cpuhp_remove_multi_state(virtionet_online);
> I have a question here: what if PT device driver module loads
> and creates a device before virtio?
>
>
Initially I also had this question of whether we get NETDEV_REGISTER
events for netdevs that are already present. But it looks like
register_netdevice_notifier() will cause a replay of all the registration
and up events of existing devices.

/**
  * register_netdevice_notifier - register a network notifier block
  * @nb: notifier
  *
  * Register a notifier to be called when network device events occur.
  * The notifier passed is linked into the kernel structures and must
  * not be reused until it has been unregistered. A negative errno code
  * is returned on a failure.
  *
  * When registered all registration and up events are replayed
  * to the new notifier to allow device to have a race free
  * view of the network device list.
  */
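
To make the load-order point concrete, here is a minimal self-contained
sketch (the callback body is made up for illustration; it is not code from
this patch) showing that a notifier registered after a device already
exists still sees a NETDEV_REGISTER event for it:

#include <linux/module.h>
#include <linux/netdevice.h>

/* Hypothetical callback: it receives replayed NETDEV_REGISTER/NETDEV_UP
 * events for netdevs that existed before the notifier was registered,
 * as well as live events for devices that show up later.
 */
static int demo_netdev_event(struct notifier_block *nb,
			     unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);

	if (event == NETDEV_REGISTER)
		pr_info("REGISTER seen for %s (replayed or live)\n", dev->name);

	return NOTIFY_DONE;
}

static struct notifier_block demo_nb = {
	.notifier_call = demo_netdev_event,
};

static int __init demo_init(void)
{
	/* Registration and up events of existing devices are replayed here. */
	return register_netdevice_notifier(&demo_nb);
}

static void __exit demo_exit(void)
{
	unregister_netdevice_notifier(&demo_nb);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");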

Thanks
Sridhar
Siwei Liu Jan. 26, 2018, 9:46 p.m. UTC | #13
On Fri, Jan 26, 2018 at 8:51 AM, Samudrala, Sridhar
<sridhar.samudrala@intel.com> wrote:
>
>
> On 1/26/2018 12:14 AM, Siwei Liu wrote:
>>
>> On Tue, Jan 23, 2018 at 2:58 PM, Michael S. Tsirkin <mst@redhat.com>
>> wrote:
>>>
>>> On Tue, Jan 23, 2018 at 12:24:47PM -0800, Siwei Liu wrote:
>>>>
>>>> On Mon, Jan 22, 2018 at 1:41 PM, Michael S. Tsirkin <mst@redhat.com>
>>>> wrote:
>>>>>
>>>>> On Mon, Jan 22, 2018 at 12:27:14PM -0800, Siwei Liu wrote:
>>>>>>
>>>>>> First off, as mentioned in another thread, the model of stacking up
>>>>>> virt-bond functionality over virtio seems a wrong direction to me.
>>>>>> Essentially the migration process would need to carry over all guest
>>>>>> side configurations previously done on the VF/PT and get them moved to
>>>>>> the new device being it virtio or VF/PT.
>>>>>
>>>>> I might be wrong but I don't see why we should worry about this
>>>>> usecase.
>>>>> Whoever has a bond configured already has working config for migration.
>>>>> We are trying to help people who don't, not convert existig users.
>>>>
>>>> That has been placed in the view of cloud providers that the imported
>>>> images from the store must be able to run unmodified thus no
>>>> additional setup script is allowed (just as Stephen mentioned in
>>>> another mail). Cloud users don't care about live migration themselves
>>>> but the providers are required to implement such automation mechanism
>>>> to make this process transparent if at all possible. The user does not
>>>> care about the device underneath being VF or not, but they do care
>>>> about consistency all across and the resulting performance
>>>> acceleration in making VF the prefered datapath. It is not quite
>>>> peculiar user cases but IMHO *any* approach proposed for live
>>>> migration should be able to persist the state including network config
>>>> e.g. as simple as MTU. Actually this requirement has nothing to do
>>>> with virtio but our target users are live migration agnostic, being it
>>>> tracking DMA through dirty pages, using virtio as the helper, or
>>>> whatsoever, the goal of persisting configs across remains same.
>>>
>>> So the patching being discussed here will mostly do exactly that if your
>>> original config was simply a single virtio net device.
>>>
>> True, but I don't see the patch being discussed starts with good
>> foundation of supporting the same for VF/PT device. That is the core
>> of the issue.
>
>
>>> What kind of configs do your users have right now?
>>
>> Any configs be it generic or driver specific that the VF/PT device
>> supports and have been enabled/configured. General network configs
>> (MAC, IP address, VLAN, MTU, iptables rules), ethtool settings
>> (hardware offload, # of queues and ring entris, RSC options, rss
>> rxfh-indir table, rx-flow-hash, et al) , bpf/XDP program being run, tc
>> flower offload, just to name a few. As cloud providers we don't limit
>> users from applying driver specific tuning to the NIC/VF, and
>> sometimes this is essential to achieving best performance for their
>> workload. We've seen cases like tuning coalescing parameters for
>> getting low latency, changing rx-flow-hash function for better VXLAN
>> throughput, or even adopting quite advanced NIC features such as flow
>> director or cloud filter. We don't expect users to compromise even a
>> little bit on these. That is once we turn on live migration for the VF
>> or pass through devices in the VM, it all takes place under the hood,
>> users (guest admins, applications) don't have to react upon it or even
>> notice the change. I should note that the majority of live migrations
>> take place between machines with completely identical hardware, it's
>> more critical than necessary to keep the config as-is across the move,
>> stealth while quiet.
>
>
> This usecase is much more complicated and different than what this patch is
> trying
> to address.

Yep, it is technically difficult, but as cloud providers we would like to
take action to address the use case on our own if no one else is willing
to do so. However, we're not seeking a complicated design or trying to
mess up other use cases such as yours. This is the first time a real
patch for the PV failover approach, although discussed for years, has
been posted to the mailing list. All voices suddenly came in at once,
with various parties wishing their specific needs added to the todo list,
and it's indeed hard to accommodate everything in the first place. I went
through the same tough period while doing similar work, so I completely
understand. The task is not easy, for sure. :)

The attempt I made was to consolidate all potential use cases into one
single solution rather than diverge from the very beginning. This is
still at the RFC phase, and I don't want to wait until very late to
express our interest.

>  Also your usecase seems to be assuming that source and
> destination
> hosts are identical and have the same HW.

Not exactly. This will be positioned as an optimization, but it is
crucial to our use case. For the generic case with non-identical HW, we
can find the least common denominator and apply those configs that the
new VF or the para-virt driver can support without compromising too much.

>
> It makes it pretty hard to transparently migrate all these settings with
> live
> migration when we are looking at a solution that unplugs the VF interface
> from
> the host and the VF driver is unloaded.
>
>
>> As you see generic bond or bridge cannot suffice the need. That's why
>> we need a new customized virt bond driver, and tailor it for VM live
>> migration specifically. Leveraging para-virtual e.g. virtio net device
>> as the backup path is one thing, tracking through driver config
>> changes in order to re-config as necessary is another. I would think
>> without considering the latter, the proposal being discussed is rather
>> incomplete, and remote to be useful in production.
>>
>>>
>>>>>> Without the help of a new
>>>>>> upper layer bond driver that enslaves virtio and VF/PT devices
>>>>>> underneath, virtio will be overloaded with too much specifics being a
>>>>>> VF/PT backup in the future.
>>>>>
>>>>> So this paragraph already includes at least two conflicting
>>>>> proposals. On the one hand you want a separate device for
>>>>> the virtual bond, on the other you are saying a separate
>>>>> driver.
>>>>
>>>> Just to be crystal clear: separate virtual bond device (netdev ops,
>>>> not necessarily bus device) for VM migration specifically with a
>>>> separate driver.
>>>
>>> Okay, but note that any config someone had on a virtio device won't
>>> propagate to that bond.
>>>
>>>>> Further, the reason to have a separate *driver* was that
>>>>> some people wanted to share code with netvsc - and that
>>>>> one does not create a separate device, which you can't
>>>>> change without breaking existing configs.
>>>>
>>>> I'm not sure I understand this statement. netvsc is already another
>>>> netdev being created than the enslaved VF netdev, why it bothers?
>>>
>>> Because it shipped, so userspace ABI is frozen.  You can't really add a
>>> netdevice and enslave an existing one without a risk of breaking some
>>> userspace configs.
>>>
>> I still don't understand this concern. Like said, before this patch
>> becomes reality, users interact with raw VF interface all the time.
>> Now this patch introduces a virtio net devive and enslave the VF.
>> Users have to interact with two interfaces - IP address and friends
>> configured on the VF will get lost and users have to reconfigure
>> virtio all over again. But some other configs e.g. ethtool needs to
>> remain on the VF. How does it guarantee existing configs won't broken?
>> Appears to me this is nothing different than having both virtio and VF
>> netdevs enslaved and users operates on the virt-bond interface
>> directly.
>
>
> Yes. This patch doesn't transparently provide live migration support to
> existing
> network configurations that only use a VF.  In order to get live migration
> support,
> for a VF only image, the network configuration has to change.
>
> It provides hypervisor controlled VF acceleration to existing virtio_net
> based network
> configurations in a transparent manner.

OK. But do you plan to address the former (transparently providing
support for existing network configurations) with separate patches in the
future? Say, if we go with your 2-netdev approach.

>
>
>>
>> One thing I'd like to point out is the configs are mostly done in the
>> control plane. It's entirely possible to separate the data and control
>> paths in the new virt-bond driver: in the data plane, it may bypass
>> the virt-bond layer and quickly fall through to the data path of
>> virtio or VF slave; while in the control plane, the virt-bond may
>> disguise itself as the active slave, delegate the config changes to
>> the real driver, relay and expose driver config/state to the user. By
>> doing that the users and userspace applications just interact with one
>> single interface, the same way they interacted with the VF interface
>> as before. Users don't have to deal with the other two enslaved
>> interfaces directly - those automatically enslaved devices should be
>> made invisible from userspace applications and admins, and/or be
>> masked out from regular access by existing kernel APIs.
>>
>> I don't find it a good reason to reject the idea if we can sort out
>> ways not to break existing ABI or APIs.
>>
>>
>>>> In
>>>> the Azure case, the stock image to be imported does not bind to a
>>>> specific driver but only MAC address.
>>>
>>> I'll let netvsc developers decide this, on the surface I don't think
>>> it's reasonable to assume everyone only binds to a MAC.
>>
>> Sure. The point I wanted to make was that cloud providers are super
>> elastic in provisioning images - those driver or device specifics
>> should have been dehydrated from the original images thus make it
>> flexible enough to deploy to machines with vast varieties of hardware.
>> Although it's not necessarily the case everyone binds to a MAC, it's
>> worth taking a look at what the target users are doing and what the
>> pain points really are and understand what could be done to solve core
>> problems. Hyper-V netvsc can also benefit once moved to it, I'd
>> believe.
>>
>>>
>>>> And people just deal with the
>>>> new virt-bond netdev rather than the underlying virtio and VF. And
>>>> both these two underlying netdevs should be made invisible to prevent
>>>> userspace script from getting them misconfigured IMHO.
>>>>
>>>> A separate driver was for code sharing for sure, only just netvsc but
>>>> could be other para-virtual devices floating around: any PV can serve
>>>> as the side channel and the backup path for VF/PT. Once we get the new
>>>> driver working atop virtio we may define ops and/or protocol needed to
>>>> talk to various other PV frontend that may implement the side channel
>>>> of its own for datapath switching (e.g. virtio is one of them, Xen PV
>>>> frontend can be another). I just don't like to limit the function to
>>>> virtio only and we have to duplicate code then it starts to scatter
>>>> around all over the places.
>>>>
>>>> I understand right now we start it as simple so it may just be fine
>>>> that the initial development activities center around virtio. However,
>>>> from cloud provider/vendor perspective I don't see the proposed scheme
>>>> limits to virtio only. Any other PV driver which has the plan to
>>>> support the same scheme can benefit. The point is that we shouldn't be
>>>> limiting the scheme to virtio specifics so early which is hard to have
>>>> it promoted to a common driver once we get there.
>>>
>>> The whole idea has been floating around for years. It would always
>>> get being drowned in this kind of "lets try to cover all use-cases"
>>> discussions, and never make progress.
>>> So let's see some working code merged. If it works fine for virtio
>>> and turns out to be a good fit for netvsc, we can share code.
>>
>> I think we at least should start with a separate netdev other than
>> virtio. That is what we may agree to have to do without comprise I'd
>> hope.
>
> I think the usecase that you are targeting does require a new para virt bond
> driver
> and a new type of netdevice.
>
> For the usecase where the host is doing all the guest network
> tuning/optimizations

I'm not particularly sure about this use case: how is the host capable of
doing guest tuning/optimization, since you may be just passing a
pass-through device to the guest? Or are you talking about VFs
specifically?

> and the VM is not expected to do any tuning/optimizations on the VF driver
> directly,
> i think the current patch that follows the netvsc model of 2 netdevs(virtio
> and vf) should
> work fine.

OK. For your use case that's fine. But that's too specific a scenario
with lots of restrictions IMHO; perhaps very few users will benefit from
it, I'm not sure. If you're unwilling to move towards it, we'd take this
one and come back with a generic solution that is able to address general
use cases for VF/PT live migration.


Regards,
-Siwei

>>>
>>>
>>>>> So some people want a fully userspace-configurable switchdev, and that
>>>>> already exists at some level, and maybe it makes sense to add more
>>>>> features for performance.
>>>>>
>>>>> But the point was that some host configurations are very simple,
>>>>> and it probably makes sense to pass this information to the guest
>>>>> and have guest act on it directly. Let's not conflate the two.
>>>>
>>>> It may be fine to push some of the configurations from host but that
>>>> perhaps doesn't cover all the cases: how is it possible for the host
>>>> to save all network states and configs done by the guest before
>>>> migration. Some of the configs might come from future guest which is
>>>> unknown to host. Anyhow the bottom line is that the guest must be able
>>>> to act on those configuration request changes automatically without
>>>> involving users intervention.
>>>>
>>>> Regards,
>>>> -Siwei
>>>
>>> All use-cases are *already* covered by existing kernel APIs.  Just use a
>>> bond, or a bridge, or whatever. It's just that they are so generic and
>>> hard to use, that userspace to do it never surfaced.
>>
>> As mentioned earlier, for which I cannot stress enough, the existing
>> generic bond or bridge doesn't work. We need a new net device that
>> works for live migration specifically and fits it well.
>>
>>> So I am interested in some code that handles some simple use-cases
>>> in the kernel, with a simple kernel API.
>>
>> It should be fine, I like simple stuffs too and wouldn't like to make
>> complications. The concept of hiding auto-managed interfaces is not
>> new and has even been implemented by other operating systems already.
>> Not sure if that is your compatibility concern. We start with simple
>> for sure, but simple != in-expandable then make potential users
>> impossible to use at all.
>>
>>
>> Thanks,
>> -Siwei
>>
>>>>> --
>>>>> MST
>>
Michael S. Tsirkin Jan. 26, 2018, 10:14 p.m. UTC | #14
On Fri, Jan 26, 2018 at 01:46:42PM -0800, Siwei Liu wrote:
> > and the VM is not expected to do any tuning/optimizations on the VF driver
> > directly,
> > i think the current patch that follows the netvsc model of 2 netdevs(virtio
> > and vf) should
> > work fine.
> 
> OK. For your use case that's fine. But that's too specific scenario
> with lots of restrictions IMHO, perhaps very few users will benefit
> from it, I'm not sure. If you're unwilling to move towards it, we'd
> take this one and come back with a generic solution that is able to
> address general use cases for VF/PT live migration .

I think that's a fine approach. Scratch your own itch!  I imagine a very
generic virtio-switchdev providing host routing info to guests could
address lots of usecases. A driver could bind to that one and enslave
arbitrary other devices.  Sounds reasonable.

But given the fundamental idea of a failover was floated at least as
early as 2013, and made 0 progress since precisely because it kept
trying to address more and more features, and given netvsc is already
using the basic solution with some success, I'm not inclined to block
this specific effort waiting for the generic one.
Jakub Kicinski Jan. 26, 2018, 10:47 p.m. UTC | #15
On Sat, 27 Jan 2018 00:14:20 +0200, Michael S. Tsirkin wrote:
> On Fri, Jan 26, 2018 at 01:46:42PM -0800, Siwei Liu wrote:
> > > and the VM is not expected to do any tuning/optimizations on the VF driver
> > > directly,
> > > i think the current patch that follows the netvsc model of 2 netdevs(virtio
> > > and vf) should
> > > work fine.  
> > 
> > OK. For your use case that's fine. But that's too specific scenario
> > with lots of restrictions IMHO, perhaps very few users will benefit
> > from it, I'm not sure. If you're unwilling to move towards it, we'd
> > take this one and come back with a generic solution that is able to
> > address general use cases for VF/PT live migration .  
> 
> I think that's a fine approach. Scratch your own itch!  I imagine a very
> generic virtio-switchdev providing host routing info to guests could
> address lots of usecases. A driver could bind to that one and enslave
> arbitrary other devices.  Sounds reasonable.
> 
> But given the fundamental idea of a failover was floated at least as
> early as 2013, and made 0 progress since precisely because it kept
> trying to address more and more features, and given netvsc is already
> using the basic solution with some success, I'm not inclined to block
> this specific effort waiting for the generic one.

I think there is an agreement that the extra netdev will be useful for
more advanced use cases, and is generally preferable.  What is the
argument for not doing that from the start?  If it was made I must have
missed it.  Is it just unwillingness to write the extra 300 lines of
code?  Sounds like a pretty weak argument when adding kernel ABI is at
stake...
Samudrala, Sridhar Jan. 26, 2018, 11:30 p.m. UTC | #16
On 1/26/2018 2:47 PM, Jakub Kicinski wrote:
> On Sat, 27 Jan 2018 00:14:20 +0200, Michael S. Tsirkin wrote:
>> On Fri, Jan 26, 2018 at 01:46:42PM -0800, Siwei Liu wrote:
>>>> and the VM is not expected to do any tuning/optimizations on the VF driver
>>>> directly,
>>>> i think the current patch that follows the netvsc model of 2 netdevs(virtio
>>>> and vf) should
>>>> work fine.
>>> OK. For your use case that's fine. But that's too specific scenario
>>> with lots of restrictions IMHO, perhaps very few users will benefit
>>> from it, I'm not sure. If you're unwilling to move towards it, we'd
>>> take this one and come back with a generic solution that is able to
>>> address general use cases for VF/PT live migration .
>> I think that's a fine approach. Scratch your own itch!  I imagine a very
>> generic virtio-switchdev providing host routing info to guests could
>> address lots of usecases. A driver could bind to that one and enslave
>> arbitrary other devices.  Sounds reasonable.
>>
>> But given the fundamental idea of a failover was floated at least as
>> early as 2013, and made 0 progress since precisely because it kept
>> trying to address more and more features, and given netvsc is already
>> using the basic solution with some success, I'm not inclined to block
>> this specific effort waiting for the generic one.
> I think there is an agreement that the extra netdev will be useful for
> more advanced use cases, and is generally preferable.  What is the
> argument for not doing that from the start?  If it was made I must have
> missed it.  Is it just unwillingness to write the extra 300 lines of
> code?  Sounds like a pretty weak argument when adding kernel ABI is at
> stake...

I am still not clear on the need for the extra netdev created by
virtio_net. The only advantage I can see is that the stats can be broken
out between the VF and virtio datapaths, compared to the aggregated stats
on the virtio netdev as seen with the 2-netdev approach.
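
Roughly, the kind of aggregation I have in mind for the 2-netdev model is
sketched below (the vf_netdev field and the helper name are assumptions
for illustration, not code from this patch):

/* Fold the enslaved VF's counters into the totals reported by the single
 * visible virtio netdev, so one device covers both datapaths.
 */
static void virtnet_fold_vf_stats(struct virtnet_info *vi,
				  struct rtnl_link_stats64 *tot)
{
	struct net_device *vf_netdev;
	struct rtnl_link_stats64 vf_stats;

	rcu_read_lock();
	vf_netdev = rcu_dereference(vi->vf_netdev);	/* assumed field */
	if (vf_netdev) {
		dev_get_stats(vf_netdev, &vf_stats);
		tot->rx_packets += vf_stats.rx_packets;
		tot->tx_packets += vf_stats.tx_packets;
		tot->rx_bytes   += vf_stats.rx_bytes;
		tot->tx_bytes   += vf_stats.tx_bytes;
	}
	rcu_read_unlock();
}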

With the 2-netdev model, any VM image that has a working network
configuration will transparently get VF-based acceleration without any
changes. The 3-netdev model breaks this configuration, starting with the
creation and naming of the 2 devices, through udev needing to be aware of
master and slave virtio-net devices. Also, from a user experience point
of view, loading a virtio-net with the BACKUP feature enabled will now
show 2 virtio-net netdevs.

For live migration with the advanced use cases that Siwei is suggesting,
I think we need a new driver with a new device type that can track the
VF-specific feature settings even when the VF driver is unloaded.

Thanks
Sridhar
Jakub Kicinski Jan. 27, 2018, 2:30 a.m. UTC | #17
On Fri, 26 Jan 2018 15:30:35 -0800, Samudrala, Sridhar wrote:
> On 1/26/2018 2:47 PM, Jakub Kicinski wrote:
> > On Sat, 27 Jan 2018 00:14:20 +0200, Michael S. Tsirkin wrote:  
> >> On Fri, Jan 26, 2018 at 01:46:42PM -0800, Siwei Liu wrote:  
> >>>> and the VM is not expected to do any tuning/optimizations on the VF driver
> >>>> directly,
> >>>> i think the current patch that follows the netvsc model of 2 netdevs(virtio
> >>>> and vf) should
> >>>> work fine.  
> >>> OK. For your use case that's fine. But that's too specific scenario
> >>> with lots of restrictions IMHO, perhaps very few users will benefit
> >>> from it, I'm not sure. If you're unwilling to move towards it, we'd
> >>> take this one and come back with a generic solution that is able to
> >>> address general use cases for VF/PT live migration .  
> >> I think that's a fine approach. Scratch your own itch!  I imagine a very
> >> generic virtio-switchdev providing host routing info to guests could
> >> address lots of usecases. A driver could bind to that one and enslave
> >> arbitrary other devices.  Sounds reasonable.
> >>
> >> But given the fundamental idea of a failover was floated at least as
> >> early as 2013, and made 0 progress since precisely because it kept
> >> trying to address more and more features, and given netvsc is already
> >> using the basic solution with some success, I'm not inclined to block
> >> this specific effort waiting for the generic one.  
> > I think there is an agreement that the extra netdev will be useful for
> > more advanced use cases, and is generally preferable.  What is the
> > argument for not doing that from the start?  If it was made I must have
> > missed it.  Is it just unwillingness to write the extra 300 lines of
> > code?  Sounds like a pretty weak argument when adding kernel ABI is at
> > stake...  
> 
> I am still not clear on the need for the extra netdev created by 
> virtio_net. The only advantage i can see is that the stats can be
> broken between VF and virtio datapaths compared to the aggregrated
> stats on virtio netdev as seen with the 2 netdev approach.

Maybe you're not convinced but multiple arguments were made.

> With 2 netdev model, any VM image that has a working network 
> configuration will transparently get VF based acceleration without
> any changes. 

Nothing happens transparently.  Things may happen automatically.  The
VF netdev doesn't disappear with netvsc.  The PV netdev transforms into
something it did not use to be.  And it configures and reports some
information from the PV (e.g. speed), but the PV doesn't pass traffic any
longer.

> 3 netdev model breaks this configuration starting with the creation
> and naming of the 2 devices to udev needing to be aware of master and
> slave virtio-net devices.

I don't understand this comment.  There is one virtio-net device and
one "virtio-bond" netdev.  And user space has to be aware of the special
automatic arrangement anyway, because it can't touch the VF.  It
doesn't make any difference whether it ignores the VF or PV and VF.
It simply can't touch the slaves, no matter how many there are.

> Also, from a user experience point of view, loading a virtio-net with
> BACKUP feature enabled will now show 2 virtio-net netdevs.

One virtio-net and one virtio-bond, which represents what's happening.

> For live migration with advanced usecases that Siwei is suggesting, i 
> think we need a new driver with a new device type that can track the
> VF specific feature settings even when the VF driver is unloaded.
Samudrala, Sridhar Jan. 27, 2018, 5:33 a.m. UTC | #18
On 1/26/2018 6:30 PM, Jakub Kicinski wrote:
> On Fri, 26 Jan 2018 15:30:35 -0800, Samudrala, Sridhar wrote:
>> On 1/26/2018 2:47 PM, Jakub Kicinski wrote:
>>> On Sat, 27 Jan 2018 00:14:20 +0200, Michael S. Tsirkin wrote:
>>>> On Fri, Jan 26, 2018 at 01:46:42PM -0800, Siwei Liu wrote:
>>>>>> and the VM is not expected to do any tuning/optimizations on the VF driver
>>>>>> directly,
>>>>>> i think the current patch that follows the netvsc model of 2 netdevs(virtio
>>>>>> and vf) should
>>>>>> work fine.
>>>>> OK. For your use case that's fine. But that's too specific scenario
>>>>> with lots of restrictions IMHO, perhaps very few users will benefit
>>>>> from it, I'm not sure. If you're unwilling to move towards it, we'd
>>>>> take this one and come back with a generic solution that is able to
>>>>> address general use cases for VF/PT live migration .
>>>> I think that's a fine approach. Scratch your own itch!  I imagine a very
>>>> generic virtio-switchdev providing host routing info to guests could
>>>> address lots of usecases. A driver could bind to that one and enslave
>>>> arbitrary other devices.  Sounds reasonable.
>>>>
>>>> But given the fundamental idea of a failover was floated at least as
>>>> early as 2013, and made 0 progress since precisely because it kept
>>>> trying to address more and more features, and given netvsc is already
>>>> using the basic solution with some success, I'm not inclined to block
>>>> this specific effort waiting for the generic one.
>>> I think there is an agreement that the extra netdev will be useful for
>>> more advanced use cases, and is generally preferable.  What is the
>>> argument for not doing that from the start?  If it was made I must have
>>> missed it.  Is it just unwillingness to write the extra 300 lines of
>>> code?  Sounds like a pretty weak argument when adding kernel ABI is at
>>> stake...
>> I am still not clear on the need for the extra netdev created by
>> virtio_net. The only advantage i can see is that the stats can be
>> broken between VF and virtio datapaths compared to the aggregrated
>> stats on virtio netdev as seen with the 2 netdev approach.
> Maybe you're not convinced but multiple arguments were made.
All the arguments seem to either say that semantically this doesn't look
like a bond, OR suggest use cases that this patch is not trying to solve.
This approach should help cloud environments where the guest networking
is fully controlled from the hypervisor via the PF driver, or via a port
representor when switchdev mode is enabled. The guest admin is not
expected or allowed to make any networking changes from the VM.

>
>> With 2 netdev model, any VM image that has a working network
>> configuration will transparently get VF based acceleration without
>> any changes.
> Nothing happens transparently.  Things may happen automatically.  The
> VF netdev doesn't disappear with netvsc.  The PV netdev transforms into
> something it did not use to be.  And configures and reports some
> information from the PV (e.g. speed) but PV doesn't pass traffic any
> longer.

>> 3 netdev model breaks this configuration starting with the creation
>> and naming of the 2 devices to udev needing to be aware of master and
>> slave virtio-net devices.
> I don't understand this comment.  There is one virtio-net device and
> one "virtio-bond" netdev.  And user space has to be aware of the special
> automatic arrangement anyway, because it can't touch the VF.  It
> doesn't make any difference whether it ignores the VF or PV and VF.
> It simply can't touch the slaves, no matter how many there are.

If the userspace is not expected to touch the slaves, then why do we need
to take the extra effort to expose a netdev that is just not really
useful?

>
>> Also, from a user experience point of view, loading a virtio-net with
>> BACKUP feature enabled will now show 2 virtio-net netdevs.
> One virtio-net and one virtio-bond, which represents what's happening.
This again assumes that we want to represent a bond setup. Can't we treat
this as virtio-net providing an alternate low-latency datapath by taking
over the VF datapath?
>
>> For live migration with advanced usecases that Siwei is suggesting, i
>> think we need a new driver with a new device type that can track the
>> VF specific feature settings even when the VF driver is unloaded.
Jakub Kicinski Jan. 27, 2018, 5:58 a.m. UTC | #19
On Fri, 26 Jan 2018 21:33:01 -0800, Samudrala, Sridhar wrote:
> >> 3 netdev model breaks this configuration starting with the creation
> >> and naming of the 2 devices to udev needing to be aware of master and
> >> slave virtio-net devices.  
> > I don't understand this comment.  There is one virtio-net device and
> > one "virtio-bond" netdev.  And user space has to be aware of the special
> > automatic arrangement anyway, because it can't touch the VF.  It
> > doesn't make any difference whether it ignores the VF or PV and VF.
> > It simply can't touch the slaves, no matter how many there are.  
> 
> If the userspace is not expected to touch the slaves, then why do we need to
> take extra effort to expose a netdev that is just not really useful.

You said:
"[user space] needs to be aware of master and slave virtio-net devices."

I'm saying:
It has to be aware of the special arrangement whether there is an
explicit bond netdev or not.

> >> Also, from a user experience point of view, loading a virtio-net with
> >> BACKUP feature enabled will now show 2 virtio-net netdevs.  
> > One virtio-net and one virtio-bond, which represents what's happening.  
> This again assumes that we want to represent a bond setup. Can't we 
> treat this
> as virtio-net providing an alternate low-latency datapath by taking over 
> VF datapath?

Bond is just a familiar name, we can call it something else if you
prefer.  The point is there are two data paths which can have
independent low-level settings and a higher level entity with
global settings which represents any path to the outside world.

Hiding low-level netdevs from a lay user requires a generic solution.
Alexander H Duyck Jan. 28, 2018, 5:35 p.m. UTC | #20
On Fri, Jan 26, 2018 at 9:58 PM, Jakub Kicinski <kubakici@wp.pl> wrote:
> On Fri, 26 Jan 2018 21:33:01 -0800, Samudrala, Sridhar wrote:
>> >> 3 netdev model breaks this configuration starting with the creation
>> >> and naming of the 2 devices to udev needing to be aware of master and
>> >> slave virtio-net devices.
>> > I don't understand this comment.  There is one virtio-net device and
>> > one "virtio-bond" netdev.  And user space has to be aware of the special
>> > automatic arrangement anyway, because it can't touch the VF.  It
>> > doesn't make any difference whether it ignores the VF or PV and VF.
>> > It simply can't touch the slaves, no matter how many there are.
>>
>> If the userspace is not expected to touch the slaves, then why do we need to
>> take extra effort to expose a netdev that is just not really useful.
>
> You said:
> "[user space] needs to be aware of master and slave virtio-net devices."
>
> I'm saying:
> It has to be aware of the special arrangement whether there is an
> explicit bond netdev or not.

To clarify, here the kernel should be aware that there is a device that
is an aggregate of 2 other devices. It isn't as if we need to insert
the new device "above" the virtio.

I have been doing a bit of messing around with a few ideas, and it seems
like it would be better if we could replace the virtio interface with the
virtio-bond (renaming my virt-bond concept to this since it is now
supposedly going to live in the virtio driver/interface), and then push
the original virtio down one layer and call it a virtio-backup. If I am
not mistaken we could assign the same dev pointer used by the virtio
netdev to the virtio-bond, and if we register it first with the "eth%d"
name then udev will assume that the virtio-bond device is the original
virtio and all existing scripts should just work with that. We would then
want to change the name of the virtio interface with the backup feature
bit set, maybe call it something like bkup-00:00:00 where the 00:00:00
would be the last 3 octets of the MAC address. It should solve the issue
of inserting an interface "above" the virtio by making the virtio-bond
become the virtio. The only limitation is that we will probably need to
remove the backup if the virtio device is removed, but that was already a
limitation of this solution and of others like the netvsc solution
anyway.
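
The registration order I have in mind would look roughly like the sketch
below. This is purely illustrative: the helper name and both device
pointers are made up, and because the kernel rejects ':' in interface
names, the backup name here uses bare hex octets instead of the
colon-separated form above.

/* Sketch only: bond_dev is the new virtio-bond netdev, virtio_dev the
 * original virtio netdev being pushed down one layer.
 */
static int demo_register_virtio_bond(struct net_device *bond_dev,
				     struct net_device *virtio_dev)
{
	int err;

	/* Register the bond first so it claims the eth%d slot that
	 * existing udev rules and scripts expect to see.
	 */
	strscpy(bond_dev->name, "eth%d", IFNAMSIZ);
	err = register_netdev(bond_dev);
	if (err)
		return err;

	/* Rename the real virtio using the last three octets of its MAC. */
	snprintf(virtio_dev->name, IFNAMSIZ, "bkup-%02x%02x%02x",
		 virtio_dev->dev_addr[3], virtio_dev->dev_addr[4],
		 virtio_dev->dev_addr[5]);

	return register_netdev(virtio_dev);
}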

>> >> Also, from a user experience point of view, loading a virtio-net with
>> >> BACKUP feature enabled will now show 2 virtio-net netdevs.
>> > One virtio-net and one virtio-bond, which represents what's happening.
>> This again assumes that we want to represent a bond setup. Can't we
>> treat this
>> as virtio-net providing an alternate low-latency datapath by taking over
>> VF datapath?
>
> Bond is just a familiar name, we can call it something else if you
> prefer.  The point is there are two data paths which can have
> independent low-level settings and a higher level entity with
> global settings which represents any path to the outside world.
>
> Hiding low-level netdevs from a lay user requires a generic solution.

The last thing I think we should be doing is hiding the low-level
netdevs. It doesn't solve anything. We are already trusting the owner
of the VM enough to let them have root access to the VM. That means
they can load/unload any driver they want. They don't have to use the
kernel-provided virtio driver; they could load their own. They could
even do something like run DPDK on top of it, or they could run DPDK
on top of the VF. In either case there is no way the bond would ever
be created, and all hiding devices does is make it harder to fix
problems when something gets broken. Unless I am mistaken, and
"security through obscurity" has somehow become a valid security
model.

As I mentioned to Sridhar on an off-list thread I think the goal
should be to make it so that the user wants to use the virtio-bond,
not make it so that they have no choice but to use it. The idea is we
should be making things easier for the administrator of the VM, not
harder.

- Alex
Samudrala, Sridhar Jan. 28, 2018, 7:18 p.m. UTC | #21
On 1/28/2018 9:35 AM, Alexander Duyck wrote:
> On Fri, Jan 26, 2018 at 9:58 PM, Jakub Kicinski <kubakici@wp.pl> wrote:
>> On Fri, 26 Jan 2018 21:33:01 -0800, Samudrala, Sridhar wrote:
>>>>> 3 netdev model breaks this configuration starting with the creation
>>>>> and naming of the 2 devices to udev needing to be aware of master and
>>>>> slave virtio-net devices.
>>>> I don't understand this comment.  There is one virtio-net device and
>>>> one "virtio-bond" netdev.  And user space has to be aware of the special
>>>> automatic arrangement anyway, because it can't touch the VF.  It
>>>> doesn't make any difference whether it ignores the VF or PV and VF.
>>>> It simply can't touch the slaves, no matter how many there are.
>>> If the userspace is not expected to touch the slaves, then why do we need to
>>> take extra effort to expose a netdev that is just not really useful.
>> You said:
>> "[user space] needs to be aware of master and slave virtio-net devices."
>>
>> I'm saying:
>> It has to be aware of the special arrangement whether there is an
>> explicit bond netdev or not.
> To clarify here the kernel should be aware that there is a device that
> is an aggregate of 2 other devices. It isn't as if we need to insert
> the new device "above" the virtio.
>
> I have been doing a bit of messing around with a few ideas and it
> seems like it would be better if we could replace the virtio interface
> with the virtio-bond, renaming my virt-bond concept to this since it
> is now supposedly going to live in the virtio driver, interface, and
> then push the original virtio down one layer and call it a
> virtio-backup. If I am not mistaken we could assign the same dev
> pointer used by the virtio netdev to the virtio-bond, and if we
> register it first with the "eth%d" name then udev will assume that the
> virtio-bond device is the original virtio and all existing scripts
> should just work with that. We then would want to change the name of
> the virtio interface with the backup feature bit set, maybe call it
> something like bkup-00:00:00 where the 00:00:00 would be the last 3
> octets of the MAC address. It should solve the issue of inserting an
> interface "above" the virtio by making the virtio-bond become the
> virtio. The only limitation is that we will probably need to remove
> the back-up if the virtio device is removed, however that was already
> a limitation of this solution and others like the netvsc solution
> anyway.

With the 3-netdev model, if we make the master virtio-net associated with
the real virtio PCI device, I think the userspace scripts would not
break. If we go this route, I am still not clear on the purpose of
exposing the bkup netdev. Can we start with the 2-netdev model and move
to the 3-netdev model later if we find out that there are limitations
with the 2-netdev model? I don't think this will break any user API, as
the userspace is not expected to use the bkup netdev.

>
>>>>> Also, from a user experience point of view, loading a virtio-net with
>>>>> BACKUP feature enabled will now show 2 virtio-net netdevs.
>>>> One virtio-net and one virtio-bond, which represents what's happening.
>>> This again assumes that we want to represent a bond setup. Can't we
>>> treat this
>>> as virtio-net providing an alternate low-latency datapath by taking over
>>> VF datapath?
>> Bond is just a familiar name, we can call it something else if you
>> prefer.  The point is there are two data paths which can have
>> independent low-level settings and a higher level entity with
>> global settings which represents any path to the outside world.
>>
>> Hiding low-level netdevs from a lay user requires a generic solution.
> The last thing I think we should be doing is hiding the low level
> netdevs. It doesn't solve anythying. We are already trusting the owner
> of the VM enough to let them have root access of the VM. That means
> they can load/unload any driver they want. They don't have to use the
> kernel provided virtio driver, they could load their own. They could
> even do something like run DPDK on top of it, or they could run DPDK
> on top of the VF. In either case there is no way the bond would ever
> be created and all hiding devices does is make it easier to fix
> problems when something gets broken. Unless I am mistaken, and
> "security through obscurity" has somehow become a valid security
> model.
>
> As I mentioned to Sridhar on an off-list thread I think the goal
> should be to make it so that the user wants to use the virtio-bond,
> not make it so that they have no choice but to use it. The idea is we
> should be making things easier for the administrator of the VM, not
> harder.
>
>
When the hypervisor has enabled the BACKUP bit along with a VF with the
same MAC address, the VM can use either the VF only OR the extended
virtio with the attached VF. Although there are 2 datapaths to the
device, the hypervisor will enable only one at any time.  The virtio via
PF datapath is only enabled during live migration when the VF is
unplugged. If not, VF broadcasts/multicasts will get looped back to the
VM via the PF datapath. In the RFC patch I was assuming that the VM can
use both datapaths and I had broadcasts/multicasts going over the virtio
datapath, but I don't think it is a good idea.  It requires the device to
disable broadcast replication on the VFs.
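
In other words, on the guest side the whole mechanism is gated on the
feature bit introduced in patch 1/2; as a trivial sketch (the exact
placement of the check in the driver aside):

/* Sketch: only consider enslaving a VF when the hypervisor offered the
 * backup feature from patch 1/2; otherwise virtio_net behaves as before.
 */
static bool virtnet_backup_offered(struct virtio_device *vdev)
{
	return virtio_has_feature(vdev, VIRTIO_NET_F_BACKUP);
}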

Thanks
Sridhar
Alexander H Duyck Jan. 28, 2018, 8:18 p.m. UTC | #22
On Sun, Jan 28, 2018 at 11:18 AM, Samudrala, Sridhar
<sridhar.samudrala@intel.com> wrote:
> On 1/28/2018 9:35 AM, Alexander Duyck wrote:
>>
>> On Fri, Jan 26, 2018 at 9:58 PM, Jakub Kicinski <kubakici@wp.pl> wrote:
>>>
>>> On Fri, 26 Jan 2018 21:33:01 -0800, Samudrala, Sridhar wrote:
>>>>>>
>>>>>> 3 netdev model breaks this configuration starting with the creation
>>>>>> and naming of the 2 devices to udev needing to be aware of master and
>>>>>> slave virtio-net devices.
>>>>>
>>>>> I don't understand this comment.  There is one virtio-net device and
>>>>> one "virtio-bond" netdev.  And user space has to be aware of the
>>>>> special
>>>>> automatic arrangement anyway, because it can't touch the VF.  It
>>>>> doesn't make any difference whether it ignores the VF or PV and VF.
>>>>> It simply can't touch the slaves, no matter how many there are.
>>>>
>>>> If the userspace is not expected to touch the slaves, then why do we
>>>> need to
>>>> take extra effort to expose a netdev that is just not really useful.
>>>
>>> You said:
>>> "[user space] needs to be aware of master and slave virtio-net devices."
>>>
>>> I'm saying:
>>> It has to be aware of the special arrangement whether there is an
>>> explicit bond netdev or not.
>>
>> To clarify here the kernel should be aware that there is a device that
>> is an aggregate of 2 other devices. It isn't as if we need to insert
>> the new device "above" the virtio.
>>
>> I have been doing a bit of messing around with a few ideas and it
>> seems like it would be better if we could replace the virtio interface
>> with the virtio-bond, renaming my virt-bond concept to this since it
>> is now supposedly going to live in the virtio driver, interface, and
>> then push the original virtio down one layer and call it a
>> virtio-backup. If I am not mistaken we could assign the same dev
>> pointer used by the virtio netdev to the virtio-bond, and if we
>> register it first with the "eth%d" name then udev will assume that the
>> virtio-bond device is the original virtio and all existing scripts
>> should just work with that. We then would want to change the name of
>> the virtio interface with the backup feature bit set, maybe call it
>> something like bkup-00:00:00 where the 00:00:00 would be the last 3
>> octets of the MAC address. It should solve the issue of inserting an
>> interface "above" the virtio by making the virtio-bond become the
>> virtio. The only limitation is that we will probably need to remove
>> the back-up if the virtio device is removed, however that was already
>> a limitation of this solution and others like the netvsc solution
>> anyway.
>
>
> With 3 netdev model, if we make the the master virtio-net associated with
> the
> real virtio pci device,  i think  the userspace scripts would not break.
> If we go this route, i am still not clear on the purpose of exposing the
> bkup netdev.
> Can we start with the 2 netdev model and move to 3 netdev model later if we
> find out that there are limitiations with the 2 netdev model? I don't think
> this will
> break any user API as the userspace is not expected to use the bkup netdev.

The 2 netdev model breaks a large number of expectations of user
space. Among them is XDP since we cannot guarantee a symmetric setup
between any entity and the virtio. How does it make sense that
enabling XDP on virtio shows zero Rx packets, and in the meantime you
are getting all of the packets coming in off of the VF?

In addition, we would need to rewrite the VLAN and MAC address
filtering ndo operations: we likely cannot add any VLANs, since in
most cases VFs are VLAN-locked due to things like port VLAN, and we
cannot change the MAC address, since the whole bonding concept is built
around it.

The last bit is the netpoll packet routing which the current code
assumes is using the virtio only, but I don't know if that is a valid
assumption since the VF is expected to be the default route for
everything else when it is available.

The point is that by the time you are done you will have rewritten pretty
much all of the network device ops. With that being the case, why add all
the code to virtio itself instead of just coming up with a brand new set
of ndo_ops that belong to this new device? You could leave the existing
virtio code in place and save yourself a bunch of time by just accessing
the existing virtio as a separate netdev.

>>>>>> Also, from a user experience point of view, loading a virtio-net with
>>>>>> BACKUP feature enabled will now show 2 virtio-net netdevs.
>>>>>
>>>>> One virtio-net and one virtio-bond, which represents what's happening.
>>>>
>>>> This again assumes that we want to represent a bond setup. Can't we
>>>> treat this
>>>> as virtio-net providing an alternate low-latency datapath by taking over
>>>> VF datapath?
>>>
>>> Bond is just a familiar name, we can call it something else if you
>>> prefer.  The point is there are two data paths which can have
>>> independent low-level settings and a higher level entity with
>>> global settings which represents any path to the outside world.
>>>
>>> Hiding low-level netdevs from a lay user requires a generic solution.
>>
>> The last thing I think we should be doing is hiding the low level
>> netdevs. It doesn't solve anythying. We are already trusting the owner
>> of the VM enough to let them have root access of the VM. That means
>> they can load/unload any driver they want. They don't have to use the
>> kernel provided virtio driver, they could load their own. They could
>> even do something like run DPDK on top of it, or they could run DPDK
>> on top of the VF. In either case there is no way the bond would ever
>> be created and all hiding devices does is make it easier to fix
>> problems when something gets broken. Unless I am mistaken, and
>> "security through obscurity" has somehow become a valid security
>> model.
>>
>> As I mentioned to Sridhar on an off-list thread I think the goal
>> should be to make it so that the user wants to use the virtio-bond,
>> not make it so that they have no choice but to use it. The idea is we
>> should be making things easier for the administrator of the VM, not
>> harder.
>>
>>
> When the hypervisor has enabled the BACKUP bit along with a VF with the
> same MAC address, the VM can use either the VF only OR the extended virtio
> with an attached VF. Although there are 2 datapaths to the device, the
> hypervisor will enable only one at any time.  The virtio-via-PF datapath is
> only enabled during live migration when the VF is unplugged. If not, VF
> broadcasts/multicasts will get looped back to the VM via the PF datapath.
> In the RFC patch I was assuming that the VM can use both datapaths and I
> had broadcasts/multicasts going over the virtio datapath, but I don't think
> it is a good idea.  It requires the device to disable broadcast replication
> on the VFs.
>
> Thanks
> Sridhar

This would work well for the SwitchDev model, but not so well for the
legacy model supported by the Intel drivers. With SwitchDev, my
understanding is that the broadcasts/multicasts come in on the uplink
port representor anyway in at least some of the implementations out
there.

For now though we can't assume that logic in the path. We can revisit
it when we start looking at a v2, which would likely have us spawn a
new device that acts as some sort of smart PCI bridge/PCIe switch.
That device could then be assigned to the virtio-bond/bridge and start
taking care of things like providing topology/switch information, and
maybe incorporating some DMA tracking on the VF, which would probably
be v3.

In the meantime I would say we keep this simple. We create a
virtio-bond device that does a bit of identity theft on the existing
virtio by mapping the same device, and pushes the existing virtio to a
different name to hide it from udev and avoid naming confusion. We
make the new device as dumb as it can be and don't let it do any L2
configuration such as setting a MAC address or a VLAN filter; we could
probably flag it as VLAN challenged. The XDP problem solves itself by
us not exposing any bpf or XDP operations. Then all that is left is
dealing with netpoll, which if needed we can probably solve by
borrowing the existing approach from the bonding driver.
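
To make that concrete, a rough sketch of what such a dumb ops table
could look like (every virtbond_* name and the virtbond_priv struct
below are hypothetical placeholders, nothing in this series defines
them):

/* hypothetical private data: just the two lower devices */
struct virtbond_priv {
	struct net_device __rcu *vf_netdev;
	struct net_device __rcu *backup_netdev;
};

static const struct net_device_ops virtbond_netdev_ops = {
	.ndo_open		= virtbond_open,	/* propagate admin state to slaves */
	.ndo_stop		= virtbond_close,
	.ndo_start_xmit		= virtbond_start_xmit,	/* VF when present and up, else virtio */
	.ndo_select_queue	= virtbond_select_queue,
	.ndo_get_stats64	= virtbond_get_stats64,	/* sum of both slaves */
	.ndo_validate_addr	= eth_validate_addr,
	/* deliberately no set_mac_address, no vlan add/kill, no bpf/xdp ops */
};

static void virtbond_setup(struct net_device *dev)
{
	ether_setup(dev);
	dev->netdev_ops = &virtbond_netdev_ops;
	dev->features |= NETIF_F_VLAN_CHALLENGED;	/* refuse 802.1q stacking */
}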

Anyway that is the direction I see this going in.

Thanks.

- Alex
Samudrala, Sridhar Jan. 28, 2018, 9:01 p.m. UTC | #23
On 1/28/2018 12:18 PM, Alexander Duyck wrote:
> On Sun, Jan 28, 2018 at 11:18 AM, Samudrala, Sridhar
> <sridhar.samudrala@intel.com> wrote:
>> On 1/28/2018 9:35 AM, Alexander Duyck wrote:
>>> On Fri, Jan 26, 2018 at 9:58 PM, Jakub Kicinski <kubakici@wp.pl> wrote:
>>>> On Fri, 26 Jan 2018 21:33:01 -0800, Samudrala, Sridhar wrote:
>>>>>>> 3 netdev model breaks this configuration starting with the creation
>>>>>>> and naming of the 2 devices to udev needing to be aware of master and
>>>>>>> slave virtio-net devices.
>>>>>> I don't understand this comment.  There is one virtio-net device and
>>>>>> one "virtio-bond" netdev.  And user space has to be aware of the
>>>>>> special
>>>>>> automatic arrangement anyway, because it can't touch the VF.  It
>>>>>> doesn't make any difference whether it ignores the VF or PV and VF.
>>>>>> It simply can't touch the slaves, no matter how many there are.
>>>>> If the userspace is not expected to touch the slaves, then why do we
>>>>> need to
>>>>> take extra effort to expose a netdev that is just not really useful.
>>>> You said:
>>>> "[user space] needs to be aware of master and slave virtio-net devices."
>>>>
>>>> I'm saying:
>>>> It has to be aware of the special arrangement whether there is an
>>>> explicit bond netdev or not.
>>> To clarify here the kernel should be aware that there is a device that
>>> is an aggregate of 2 other devices. It isn't as if we need to insert
>>> the new device "above" the virtio.
>>>
>>> I have been doing a bit of messing around with a few ideas and it
>>> seems like it would be better if we could replace the virtio interface
>>> with the virtio-bond, renaming my virt-bond concept to this since it
>>> is now supposedly going to live in the virtio driver, interface, and
>>> then push the original virtio down one layer and call it a
>>> virtio-backup. If I am not mistaken we could assign the same dev
>>> pointer used by the virtio netdev to the virtio-bond, and if we
>>> register it first with the "eth%d" name then udev will assume that the
>>> virtio-bond device is the original virtio and all existing scripts
>>> should just work with that. We then would want to change the name of
>>> the virtio interface with the backup feature bit set, maybe call it
>>> something like bkup-00:00:00 where the 00:00:00 would be the last 3
>>> octets of the MAC address. It should solve the issue of inserting an
>>> interface "above" the virtio by making the virtio-bond become the
>>> virtio. The only limitation is that we will probably need to remove
>>> the back-up if the virtio device is removed, however that was already
>>> a limitation of this solution and others like the netvsc solution
>>> anyway.
>>
>> With the 3 netdev model, if we make the master virtio-net associated with
>> the real virtio pci device, I think the userspace scripts would not break.
>> If we go this route, I am still not clear on the purpose of exposing the
>> bkup netdev.
>> Can we start with the 2 netdev model and move to the 3 netdev model later
>> if we find out that there are limitations with the 2 netdev model? I don't
>> think this will break any user API as the userspace is not expected to use
>> the bkup netdev.
> The 2 netdev model breaks a large number of expectations of user
> space. Among them is XDP since we cannot guarantee a symmetric setup
> between any entity and the virtio. How does it make sense that
> enabling XDP on virtio shows zero Rx packets, and in the meantime you
> are getting all of the packets coming in off of the VF?
Sure we cannot support XDP in this model and it needs to be disabled.
>
> In addition we would need to rewrite the VLAN and MAC address
> filtering ndo operations since we likely cannot add any VLANs since in
> most cases VFs are VLAN locked due to things like port VLAN and we
> cannot change the MAC address since the whole bonding concept is built
> around it.
>
> The last bit is the netpoll packet routing which the current code
> assumes is using the virtio only, but I don't know if that is a valid
> assumption since the VF is expected to be the default route for
> everything else when it is available.
>
> The point is by the time you are done you will have rewritten pretty
> much all the network device ops. With that being the case why add all
> the code to virtio itself instead of just coming up with a brand new
> set of ndo_ops that belong to this new device, and you could leave the
> existing virtio code in place and save yourself a bunch of time by
> just accessing it as an existing call as a separate netdev.

When the BACKUP feature is enabled, we can simply disable most of these
ndo ops that cannot be supported. Not sure we need an additional netdev
and set of ndo_ops.

When we can support all these use cases along with live migration we can
move to the 3 netdev model, and I think we will need a new feature bit so
that the hypervisor can allow the VM to use both datapaths and configure
the PF accordingly.

Thanks
Sridhar
Stephen Hemminger Jan. 28, 2018, 11:02 p.m. UTC | #24
On Fri, 26 Jan 2018 18:30:03 -0800
Jakub Kicinski <kubakici@wp.pl> wrote:

> On Fri, 26 Jan 2018 15:30:35 -0800, Samudrala, Sridhar wrote:
> > On 1/26/2018 2:47 PM, Jakub Kicinski wrote:  
> > > On Sat, 27 Jan 2018 00:14:20 +0200, Michael S. Tsirkin wrote:    
> > >> On Fri, Jan 26, 2018 at 01:46:42PM -0800, Siwei Liu wrote:    
> > >>>> and the VM is not expected to do any tuning/optimizations on the VF driver
> > >>>> directly,
> > >>>> i think the current patch that follows the netvsc model of 2 netdevs(virtio
> > >>>> and vf) should
> > >>>> work fine.    
> > >>> OK. For your use case that's fine. But that's too specific scenario
> > >>> with lots of restrictions IMHO, perhaps very few users will benefit
> > >>> from it, I'm not sure. If you're unwilling to move towards it, we'd
> > >>> take this one and come back with a generic solution that is able to
> > >>> address general use cases for VF/PT live migration .    
> > >> I think that's a fine approach. Scratch your own itch!  I imagine a very
> > >> generic virtio-switchdev providing host routing info to guests could
> > >> address lots of usecases. A driver could bind to that one and enslave
> > >> arbitrary other devices.  Sounds reasonable.
> > >>
> > >> But given the fundamental idea of a failover was floated at least as
> > >> early as 2013, and made 0 progress since precisely because it kept
> > >> trying to address more and more features, and given netvsc is already
> > >> using the basic solution with some success, I'm not inclined to block
> > >> this specific effort waiting for the generic one.    
> > > I think there is an agreement that the extra netdev will be useful for
> > > more advanced use cases, and is generally preferable.  What is the
> > > argument for not doing that from the start?  If it was made I must have
> > > missed it.  Is it just unwillingness to write the extra 300 lines of
> > > code?  Sounds like a pretty weak argument when adding kernel ABI is at
> > > stake...    
> > 
> > I am still not clear on the need for the extra netdev created by 
> > virtio_net. The only advantage i can see is that the stats can be
> > broken between VF and virtio datapaths compared to the aggregated
> > stats on virtio netdev as seen with the 2 netdev approach.  
> 
> Maybe you're not convinced but multiple arguments were made.
> 
> > With 2 netdev model, any VM image that has a working network 
> > configuration will transparently get VF based acceleration without
> > any changes.   
> 
> Nothing happens transparently.  Things may happen automatically.  The
> VF netdev doesn't disappear with netvsc.  The PV netdev transforms into
> something it did not use to be.  And configures and reports some
> information from the PV (e.g. speed) but PV doesn't pass traffic any
> longer.
> 
> > 3 netdev model breaks this configuration starting with the creation
> > and naming of the 2 devices to udev needing to be aware of master and
> > slave virtio-net devices.  
> 
> I don't understand this comment.  There is one virtio-net device and
> one "virtio-bond" netdev.  And user space has to be aware of the special
> automatic arrangement anyway, because it can't touch the VF.  It
> doesn't make any difference whether it ignores the VF or PV and VF.
> It simply can't touch the slaves, no matter how many there are.
> 
> > Also, from a user experience point of view, loading a virtio-net with
> > BACKUP feature enabled will now show 2 virtio-net netdevs.  
> 
> One virtio-net and one virtio-bond, which represents what's happening.
> 
> > For live migration with advanced usecases that Siwei is suggesting, i 
> > think we need a new driver with a new device type that can track the
> > VF specific feature settings even when the VF driver is unloaded.  

I see no added value of the 3 netdev model, there is no need for a bond
device.
Alexander H Duyck Jan. 29, 2018, 12:58 a.m. UTC | #25
On Sun, Jan 28, 2018 at 1:01 PM, Samudrala, Sridhar
<sridhar.samudrala@intel.com> wrote:
>
>
> On 1/28/2018 12:18 PM, Alexander Duyck wrote:
>>
>> On Sun, Jan 28, 2018 at 11:18 AM, Samudrala, Sridhar
>> <sridhar.samudrala@intel.com> wrote:
>>>
>>> On 1/28/2018 9:35 AM, Alexander Duyck wrote:
>>>>
>>>> On Fri, Jan 26, 2018 at 9:58 PM, Jakub Kicinski <kubakici@wp.pl> wrote:
>>>>>
>>>>> On Fri, 26 Jan 2018 21:33:01 -0800, Samudrala, Sridhar wrote:
>>>>>>>>
>>>>>>>> 3 netdev model breaks this configuration starting with the creation
>>>>>>>> and naming of the 2 devices to udev needing to be aware of master
>>>>>>>> and
>>>>>>>> slave virtio-net devices.
>>>>>>>
>>>>>>> I don't understand this comment.  There is one virtio-net device and
>>>>>>> one "virtio-bond" netdev.  And user space has to be aware of the
>>>>>>> special
>>>>>>> automatic arrangement anyway, because it can't touch the VF.  It
>>>>>>> doesn't make any difference whether it ignores the VF or PV and VF.
>>>>>>> It simply can't touch the slaves, no matter how many there are.
>>>>>>
>>>>>> If the userspace is not expected to touch the slaves, then why do we
>>>>>> need to
>>>>>> take extra effort to expose a netdev that is just not really useful.
>>>>>
>>>>> You said:
>>>>> "[user space] needs to be aware of master and slave virtio-net
>>>>> devices."
>>>>>
>>>>> I'm saying:
>>>>> It has to be aware of the special arrangement whether there is an
>>>>> explicit bond netdev or not.
>>>>
>>>> To clarify here the kernel should be aware that there is a device that
>>>> is an aggregate of 2 other devices. It isn't as if we need to insert
>>>> the new device "above" the virtio.
>>>>
>>>> I have been doing a bit of messing around with a few ideas and it
>>>> seems like it would be better if we could replace the virtio interface
>>>> with the virtio-bond, renaming my virt-bond concept to this since it
>>>> is now supposedly going to live in the virtio driver, interface, and
>>>> then push the original virtio down one layer and call it a
>>>> virtio-backup. If I am not mistaken we could assign the same dev
>>>> pointer used by the virtio netdev to the virtio-bond, and if we
>>>> register it first with the "eth%d" name then udev will assume that the
>>>> virtio-bond device is the original virtio and all existing scripts
>>>> should just work with that. We then would want to change the name of
>>>> the virtio interface with the backup feature bit set, maybe call it
>>>> something like bkup-00:00:00 where the 00:00:00 would be the last 3
>>>> octets of the MAC address. It should solve the issue of inserting an
>>>> interface "above" the virtio by making the virtio-bond become the
>>>> virtio. The only limitation is that we will probably need to remove
>>>> the back-up if the virtio device is removed, however that was already
>>>> a limitation of this solution and others like the netvsc solution
>>>> anyway.
>>>
>>>
>>> With the 3 netdev model, if we make the master virtio-net associated
>>> with the real virtio pci device, I think the userspace scripts would
>>> not break.
>>> If we go this route, I am still not clear on the purpose of exposing
>>> the bkup netdev.
>>> Can we start with the 2 netdev model and move to the 3 netdev model
>>> later if we find out that there are limitations with the 2 netdev
>>> model? I don't think this will break any user API as the userspace is
>>> not expected to use the bkup netdev.
>>
>> The 2 netdev model breaks a large number of expectations of user
>> space. Among them is XDP since we cannot guarantee a symmetric setup
>> between any entity and the virtio. How does it make sense that
>> enabling XDP on virtio shows zero Rx packets, and in the meantime you
>> are getting all of the packets coming in off of the VF?
>
> Sure we cannot support XDP in this model and it needs to be disabled.
>>
>>
>> In addition we would need to rewrite the VLAN and MAC address
>> filtering ndo operations since we likely cannot add any VLANs since in
>> most cases VFs are VLAN locked due to things like port VLAN and we
>> cannot change the MAC address since the whole bonding concept is built
>> around it.
>>
>> The last bit is the netpoll packet routing which the current code
>> assumes is using the virtio only, but I don't know if that is a valid
>> assumption since the VF is expected to be the default route for
>> everything else when it is available.
>>
>> The point is by the time you are done you will have rewritten pretty
>> much all the network device ops. With that being the case why add all
>> the code to virtio itself instead of just coming up with a brand new
>> set of ndo_ops that belong to this new device, and you could leave the
>> existing virtio code in place and save yourself a bunch of time by
>> just accessing it as an existing call as a separate netdev.
>
>
> When the BACKUP feature is enabled, we can simply disable most of these ndo
> ops
> that cannot be supported. Not sure we need an additional netdev and ndo_ops.

The point is feature bits have to be changed, and network
device/ethtool ops have to be rewritten. At that point you have
essentially forked the virtio interface. What I don't understand is
what your resistance is to doing this as 3 netdevs? I thought the
original issue was the fact that the bond-like interface would come up
with a different name/ID. I think we have a solution for that at this
point. I think the bigger issue is that this 2 netdev approach is far
too invasive to be done in the virtio driver itself and would require
massive changes to it. It makes more sense to just put together a
small new netdev that basically does the "dumb-bond" solution.

> When we can support all these usecases along with live migration we can move
> to the 3 netdev model and i think we will need a new feature bit so that the
> hypervisor can allow VM to use both datapaths and configure PF accordingly.

Right now you aren't supporting any of the use cases you claim to. The
code as it stands is full of corner cases. I have already pointed out
several overlooked holes in the code that are going to sink
the whole concept if we were to try to push something like this into
production.

On Thursday I tried seeing how hard it would be to make bonding fit
into the model we need. The bonding driver is carrying a TON of extra
bloat and technical debt that we don't need at this point since it has
all the MII/ARP code, sysfs, procfs, debugfs, notifiers, and modes
that we don't want. Trying to cut out that code and make something
that would be usable for this is probably going to be too expensive in
the long run. In the meantime though I think I have developed an
appreciation of where both solutions are currently lacking.

I think implementing another netdev at this point is the only way to
go forward; otherwise virtio is going to quickly bloat up with a bunch
of code that has to disable almost everything in it if BACKUP is
enabled as a feature. So here is the list of netdev ops for virtio:
        .ndo_open            = virtnet_open,
        .ndo_stop            = virtnet_close,

I assume these first 2 will need some changes. I suspect there would
be threads responsible for doing things like maintaining the link
statistics that need to be collected periodically. Also I don't know
if you want to keep the slaves up if the master is up or not. In
addition I don't see any code in your current solution that calls
dev_open. I am assuming something was probably bringing up the
interface for you and then you enslaved it instead of the virtio
interface bringing up the interface itself.
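
As a sketch of what I'd expect the open path to look like (virtbond_*
and virtbond_priv are placeholder names, assuming the private data
holds RCU-protected vf_netdev/backup_netdev pointers):

static int virtbond_open(struct net_device *dev)
{
	struct virtbond_priv *priv = netdev_priv(dev);
	struct net_device *backup = rtnl_dereference(priv->backup_netdev);
	struct net_device *vf = rtnl_dereference(priv->vf_netdev);

	/* propagate the administrative state down to whichever slaves exist */
	if (backup && dev_open(backup))
		netdev_warn(dev, "failed to open backup %s\n", backup->name);
	if (vf && dev_open(vf))
		netdev_warn(dev, "failed to open VF %s\n", vf->name);

	netif_tx_start_all_queues(dev);
	return 0;
}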

        .ndo_start_xmit      = start_xmit,

Your original patch included a small set of changes for this function.
One thing you appear to have overlooked was the fact that
qdisc_skb_cb(skb)->slave_dev_queue_mapping relies on ndo_select_queue.
It means you are missing at least one new ndo_op that is not currently
provided by virtio. In addition we may want to look at providing a
change of the queue mapping for the combined interface since the VF
may support more queues than the virtio currently does. In my mind we
probably don't even need a qdisc for the upper level device anyway. We
could probably improve our performance if we could make the device not
have a qdisc and even more if we could make it run with LLTX since the
lower device has both covered for us already.
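
As a sketch of the kind of ndo_select_queue that would be needed,
loosely following the shape of what netvsc does today; the function
name is a placeholder, but vi->vf_netdev and the
slave_dev_queue_mapping field are the ones this patch already relies
on:

static u16 virtnet_select_queue(struct net_device *dev, struct sk_buff *skb,
				void *accel_priv,
				select_queue_fallback_t fallback)
{
	struct virtnet_info *vi = netdev_priv(dev);
	struct net_device *vf_netdev;
	u16 txq;

	rcu_read_lock();
	vf_netdev = rcu_dereference(vi->vf_netdev);
	if (vf_netdev) {
		const struct net_device_ops *vf_ops = vf_netdev->netdev_ops;

		/* let the VF pick its own queue and stash it for the xmit path */
		if (vf_ops->ndo_select_queue)
			txq = vf_ops->ndo_select_queue(vf_netdev, skb,
						       accel_priv, fallback);
		else
			txq = fallback(vf_netdev, skb);

		qdisc_skb_cb(skb)->slave_dev_queue_mapping = txq;
	} else {
		txq = fallback(dev, skb);
	}
	rcu_read_unlock();

	/* clamp to the queue count of the virtio side */
	while (unlikely(txq >= dev->real_num_tx_queues))
		txq -= dev->real_num_tx_queues;

	return txq;
}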

        .ndo_validate_addr   = eth_validate_addr,
        .ndo_set_mac_address = virtnet_set_mac_address,
        .ndo_set_rx_mode     = virtnet_set_rx_mode,

I already mentioned most of the L2 address/filtering code would
probably have to go since the VF drivers aren't usually allowed to do
things like go into full promiscuous mode or change their MAC address
anyway, and we don't want the address of a MAC address based bond to
be changed.

        .ndo_get_stats64     = virtnet_stats,

You already modified this in your first rev of the patch.  Your
solution is okay I suppose, though it might be more interesting if you
could just provide a merged version of the stats of the 2 interfaces
instead of just capturing some of the Tx/Rx stats.
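
A merged version could be as simple as summing dev_get_stats() over
both lower devices, something along these lines (placeholder names
again, same hypothetical virtbond_priv as before):

static void virtbond_get_stats64(struct net_device *dev,
				 struct rtnl_link_stats64 *tot)
{
	struct virtbond_priv *priv = netdev_priv(dev);
	struct net_device *slaves[2];
	struct rtnl_link_stats64 tmp;
	int i;

	rcu_read_lock();
	slaves[0] = rcu_dereference(priv->backup_netdev);
	slaves[1] = rcu_dereference(priv->vf_netdev);

	for (i = 0; i < 2; i++) {
		if (!slaves[i])
			continue;

		memset(&tmp, 0, sizeof(tmp));
		dev_get_stats(slaves[i], &tmp);

		/* tot was zeroed by dev_get_stats() before we were called */
		tot->rx_packets += tmp.rx_packets;
		tot->tx_packets += tmp.tx_packets;
		tot->rx_bytes   += tmp.rx_bytes;
		tot->tx_bytes   += tmp.tx_bytes;
		tot->rx_dropped += tmp.rx_dropped;
		tot->tx_dropped += tmp.tx_dropped;
	}
	rcu_read_unlock();
}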

        .ndo_vlan_rx_add_vid = virtnet_vlan_rx_add_vid,
        .ndo_vlan_rx_kill_vid = virtnet_vlan_rx_kill_vid,

These can both go since we could assume the combined interface is more
than likely "VLAN challenged" due to the fact that the primary use
case is VF migration. Otherwise we need to be able to guarantee that
any VLAN added to the interface is present on both the VF and the
virtio. Since that is extra work just calling the interface "VLAN
challenged" for now is easier.

#ifdef CONFIG_NET_POLL_CONTROLLER
        .ndo_poll_controller = virtnet_netpoll,
#endif

The polling logic needs to be updated so that the polling follows the
Tx packet path instead of just assuming everything is running through
virtio since a backup interface may or may not be capable of
transmitting packets when not in use.

        .ndo_bpf                = virtnet_xdp,
        .ndo_xdp_xmit           = virtnet_xdp_xmit,
        .ndo_xdp_flush          = virtnet_xdp_flush,

This whole block would have to be dropped for the combined interface.
That way we will at least get symmetric behavior between a VF and the
virtio data paths.

        .ndo_features_check     = passthru_features_check,

This part can mostly stay, but as I said the feature bits are going to
have to significantly change for the interface that is sitting on top
of the VF and virtio since the feature functionality likely differs
quite a bit.
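
For example, something along the lines of an ndo_fix_features that
clamps the advertised features to the intersection of the two slaves,
using the existing netdev_intersect_features() helper (sketch only,
placeholder names):

static netdev_features_t virtbond_fix_features(struct net_device *dev,
					       netdev_features_t features)
{
	struct virtbond_priv *priv = netdev_priv(dev);
	struct net_device *backup = rtnl_dereference(priv->backup_netdev);
	struct net_device *vf = rtnl_dereference(priv->vf_netdev);

	if (backup)
		features = netdev_intersect_features(features,
						     backup->features);
	if (vf)
		features = netdev_intersect_features(features, vf->features);

	return features;
}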



I would go through the ethtool ops, but the fact is most of them just
don't apply. All bond implements is the get_drvinfo, get_link, and
get_link_ksettings and that is probably all that would be needed for
the virtio-bond interface.



Hopefully that clarifies things. In my mind the 2 netdev approach
would be the approach to consider for later. For now it is probably
easier to just have the virtio-bond, virtio-backup, and the VF all
present as it makes it easier to get away with just reusing existing
code since all the needed ops are exposed by the netdevs. Hiding them
is going to make things more difficult for debugging so I am strongly
opposed to hiding things.

I realize I told you I was willing to disagree and commit on this, but
after spending a full day reviewing the solution proposed and spending
some time crawling through the bonding code to get a better grasp of
things I am going to have to say the 2 netdev approach just isn't
feasible. We need to have 3 separate netdevs exposing 3 separate sets
of ndo/ethtool ops in order for things to work correctly. Anything
else is going to end up severely impacting features and/or
performance.

- Alex
Jason Wang Jan. 29, 2018, 3:32 a.m. UTC | #26
On 2018年01月24日 00:03, Samudrala, Sridhar wrote:
>
>
> On 1/23/2018 2:33 AM, Jason Wang wrote:
>>
>>
>> On 2018年01月12日 13:58, Sridhar Samudrala wrote:
>>>   static netdev_tx_t start_xmit(struct sk_buff *skb, struct 
>>> net_device *dev)
>>>   {
>>>       struct virtnet_info *vi = netdev_priv(dev);
>>>       int qnum = skb_get_queue_mapping(skb);
>>>       struct send_queue *sq = &vi->sq[qnum];
>>> +    struct net_device *vf_netdev;
>>>       int err;
>>>       struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum);
>>>       bool kick = !skb->xmit_more;
>>>       bool use_napi = sq->napi.weight;
>>>   +    /* If VF is present and up then redirect packets
>>> +     * called with rcu_read_lock_bh
>>> +     */
>>> +    vf_netdev = rcu_dereference_bh(vi->vf_netdev);
>>> +    if (vf_netdev && netif_running(vf_netdev) &&
>>> +        !netpoll_tx_running(dev) &&
>>> +        is_unicast_ether_addr(eth_hdr(skb)->h_dest))
>>> +        return virtnet_vf_xmit(dev, vf_netdev, skb);
>>> +
>>
>> A question here.
>>
>> If I read the code correctly, all features were validated against 
>> virtio instead VF before transmitting. This assumes VF's feature is a 
>> superset of virtio, does this really work? Do we need to sanitize the 
>> feature before joining? (e.g at last NETIF_R_GSO_ROBUST needs to be 
>> removed).
>
> Actually, virtnet_vf_xmit() calls dev_queue_xmit() after updating 
> skb->dev to vf netdev.
> So the features get validated against VF features and the right tx 
> queue is selected
> before the real transmit.

I see, the packet will go through qdiscs twice, which is suboptimal.
And the device may lose some of the VF's abilities, such as tunnel GSO.

Thanks

>
> Thanks
> Sridhar
>
>
>
Alexander H Duyck Jan. 29, 2018, 4:26 a.m. UTC | #27
On Sun, Jan 28, 2018 at 3:02 PM, Stephen Hemminger
<stephen@networkplumber.org> wrote:
> On Fri, 26 Jan 2018 18:30:03 -0800
> Jakub Kicinski <kubakici@wp.pl> wrote:
>
>> On Fri, 26 Jan 2018 15:30:35 -0800, Samudrala, Sridhar wrote:
>> > On 1/26/2018 2:47 PM, Jakub Kicinski wrote:
>> > > On Sat, 27 Jan 2018 00:14:20 +0200, Michael S. Tsirkin wrote:
>> > >> On Fri, Jan 26, 2018 at 01:46:42PM -0800, Siwei Liu wrote:
>> > >>>> and the VM is not expected to do any tuning/optimizations on the VF driver
>> > >>>> directly,
>> > >>>> i think the current patch that follows the netvsc model of 2 netdevs(virtio
>> > >>>> and vf) should
>> > >>>> work fine.
>> > >>> OK. For your use case that's fine. But that's too specific scenario
>> > >>> with lots of restrictions IMHO, perhaps very few users will benefit
>> > >>> from it, I'm not sure. If you're unwilling to move towards it, we'd
>> > >>> take this one and come back with a generic solution that is able to
>> > >>> address general use cases for VF/PT live migration .
>> > >> I think that's a fine approach. Scratch your own itch!  I imagine a very
>> > >> generic virtio-switchdev providing host routing info to guests could
>> > >> address lots of usecases. A driver could bind to that one and enslave
>> > >> arbitrary other devices.  Sounds reasonable.
>> > >>
>> > >> But given the fundamental idea of a failover was floated at least as
>> > >> early as 2013, and made 0 progress since precisely because it kept
>> > >> trying to address more and more features, and given netvsc is already
>> > >> using the basic solution with some success, I'm not inclined to block
>> > >> this specific effort waiting for the generic one.
>> > > I think there is an agreement that the extra netdev will be useful for
>> > > more advanced use cases, and is generally preferable.  What is the
>> > > argument for not doing that from the start?  If it was made I must have
>> > > missed it.  Is it just unwillingness to write the extra 300 lines of
>> > > code?  Sounds like a pretty weak argument when adding kernel ABI is at
>> > > stake...
>> >
>> > I am still not clear on the need for the extra netdev created by
>> > virtio_net. The only advantage i can see is that the stats can be
>> > broken between VF and virtio datapaths compared to the aggregated
>> > stats on virtio netdev as seen with the 2 netdev approach.
>>
>> Maybe you're not convinced but multiple arguments were made.
>>
>> > With 2 netdev model, any VM image that has a working network
>> > configuration will transparently get VF based acceleration without
>> > any changes.
>>
>> Nothing happens transparently.  Things may happen automatically.  The
>> VF netdev doesn't disappear with netvsc.  The PV netdev transforms into
>> something it did not use to be.  And configures and reports some
>> information from the PV (e.g. speed) but PV doesn't pass traffic any
>> longer.
>>
>> > 3 netdev model breaks this configuration starting with the creation
>> > and naming of the 2 devices to udev needing to be aware of master and
>> > slave virtio-net devices.
>>
>> I don't understand this comment.  There is one virtio-net device and
>> one "virtio-bond" netdev.  And user space has to be aware of the special
>> automatic arrangement anyway, because it can't touch the VF.  It
>> doesn't make any difference whether it ignores the VF or PV and VF.
>> It simply can't touch the slaves, no matter how many there are.
>>
>> > Also, from a user experience point of view, loading a virtio-net with
>> > BACKUP feature enabled will now show 2 virtio-net netdevs.
>>
>> One virtio-net and one virtio-bond, which represents what's happening.
>>
>> > For live migration with advanced usecases that Siwei is suggesting, i
>> > think we need a new driver with a new device type that can track the
>> > VF specific feature settings even when the VF driver is unloaded.
>
> I see no added value of the 3 netdev model, there is no need for a bond
> device.

I agree a full-blown bond isn't what is needed. However, just forking
traffic out from virtio to a VF doesn't really solve things either.

One of the issues as I see it is that the qdisc model with the merged
device gets to be pretty ugly. For one thing, every packet that goes
to the VF has to pass through the qdisc code twice. The dual nature of
the 2 netdev solution also introduces the potential for head-of-line
blocking, since the virtio could put back pressure on the upper qdisc
layer, which could stall the VF traffic when switching over. I hope we
could avoid issues like that by maintaining qdiscs per device queue
instead of on an upper device that is half software interface and half
not. Ideally the virtio-bond device could operate without a qdisc and
without needing any additional locks, so there shouldn't be
head-of-line blocking between the two interfaces and overhead could be
kept minimal.
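
As a rough sketch of what I mean (names hypothetical), the master
would just be registered with something like:

static void virtbond_setup_queues(struct net_device *bond_dev)
{
	bond_dev->priv_flags |= IFF_NO_QUEUE;	/* no root qdisc on the master */
	bond_dev->features   |= NETIF_F_LLTX;	/* no HARD_TX_LOCK, slaves do their own locking */
}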

Also in the case of virtio there is support for in-driver XDP. As
Sridhar stated, when using the 2 netdev model "we cannot support XDP
in this model and it needs to be disabled". That sounds like a step
backwards instead of forwards. I would much rather leave the XDP
enabled at the lower dev level, and then if we want we can use the
generic XDP at the virtio-bond level to capture traffic on both
interfaces at the same time.

In the case of netvsc you have control of both sides of a given link
so you can match up the RSS tables, queue configuration and everything
is somewhat symmetric since you are running the PF and all the HyperV
subchannels. Most of the complexity is pushed down into the host and
your subchannel management is synchronized there if I am not mistaken.
We don't have this in the case of this virtio-bond setup. Instead a
single bit is set indicating "backup" without indicating what that
means to topology other than the fact that this virtio interface is
the backup for some other interface. We are essentially blind other
than having the link status for the VF and virtio and knowing that the
virtio is intended to be the backup.

We might be able to get to a 2 or maybe even a 1 netdev solution at
some point in the future, but  I don't think that time is now. For now
a virtio-bond type solution would allow us to address most of the use
cases with minimal modification to the existing virtio and can deal
with feature and/or resource asymmetry.

- Alex
Michael S. Tsirkin Jan. 29, 2018, 6:24 p.m. UTC | #28
On Sun, Jan 28, 2018 at 08:26:53PM -0800, Alexander Duyck wrote:
> >> > For live migration with advanced usecases that Siwei is suggesting, i
> >> > think we need a new driver with a new device type that can track the
> >> > VF specific feature settings even when the VF driver is unloaded.
> >
> > I see no added value of the 3 netdev model, there is no need for a bond
> > device.
> 
> I agree a full-blown bond isn't what is needed. However, just forking
> traffic out from virtio to a VF doesn't really solve things either.
> 
> One of the issues as I see it is the fact that the qdisc model with
> the merged device gets to be pretty ugly. There is the fact that every
> packet that goes to the VF has to pass through the qdisc code twice.
> There is the dual nature of the 2 netdev solution that also introduces
> the potential for head-of-line blocking since the virtio could put
> back pressure on the upper qdisc layer which could stall the VF
> traffic when switching over. I hope we could avoid issues like that by
> maintaining qdiscs per device queue instead of on an upper device that
> is half software interface and half not. Ideally the virtio-bond
> device could operate without a qdisc and without needing any
> additional locks so there shouldn't be head of line blocking occurring
> between the two interfaces and overhead could be kept minimal.
> 
> Also in the case of virtio there is support for in-driver XDP. As
> Sridhar stated, when using the 2 netdev model "we cannot support XDP
> in this model and it needs to be disabled". That sounds like a step
> backwards instead of forwards. I would much rather leave the XDP
> enabled at the lower dev level, and then if we want we can use the
> generic XDP at the virtio-bond level to capture traffic on both
> interfaces at the same time.

I agree dropping XDP makes everything very iffy.

> In the case of netvsc you have control of both sides of a given link
> so you can match up the RSS tables, queue configuration and everything
> is somewhat symmetric since you are running the PF and all the HyperV
> subchannels. Most of the complexity is pushed down into the host and
> your subchannel management is synchronized there if I am not mistaken.
> We don't have this in the case of this virtio-bond setup. Instead a
> single bit is set indicating "backup" without indicating what that
> means to topology other than the fact that this virtio interface is
> the backup for some other interface. We are essentially blind other
> than having the link status for the VF and virtio and knowing that the
> virtio is intended to be the backup.

Would you be interested in posting at least a proof of concept
patch for this approach? It's OK if there are some TODO stubs.
It would be much easier to compare code to code rather than
a high level description to code.

> We might be able to get to a 2 or maybe even a 1 netdev solution at
> some point in the future, but  I don't think that time is now. For now
> a virtio-bond type solution would allow us to address most of the use
> cases with minimal modification to the existing virtio and can deal
> with feature and/or resource asymmetry.
> 
> - Alex
Alexander H Duyck Jan. 29, 2018, 8:09 p.m. UTC | #29
On Mon, Jan 29, 2018 at 10:24 AM, Michael S. Tsirkin <mst@redhat.com> wrote:
> On Sun, Jan 28, 2018 at 08:26:53PM -0800, Alexander Duyck wrote:
>> >> > For live migration with advanced usecases that Siwei is suggesting, i
>> >> > think we need a new driver with a new device type that can track the
>> >> > VF specific feature settings even when the VF driver is unloaded.
>> >
>> > I see no added value of the 3 netdev model, there is no need for a bond
>> > device.
>>
>> I agree a full-blown bond isn't what is needed. However, just forking
>> traffic out from virtio to a VF doesn't really solve things either.
>>
>> One of the issues as I see it is the fact that the qdisc model with
>> the merged device gets to be pretty ugly. There is the fact that every
>> packet that goes to the VF has to pass through the qdisc code twice.
>> There is the dual nature of the 2 netdev solution that also introduces
>> the potential for head-of-line blocking since the virtio could put
>> back pressure on the upper qdisc layer which could stall the VF
>> traffic when switching over. I hope we could avoid issues like that by
>> maintaining qdiscs per device queue instead of on an upper device that
>> is half software interface and half not. Ideally the virtio-bond
>> device could operate without a qdisc and without needing any
>> additional locks so there shouldn't be head of line blocking occurring
>> between the two interfaces and overhead could be kept minimal.
>>
>> Also in the case of virtio there is support for in-driver XDP. As
>> Sridhar stated, when using the 2 netdev model "we cannot support XDP
>> in this model and it needs to be disabled". That sounds like a step
>> backwards instead of forwards. I would much rather leave the XDP
>> enabled at the lower dev level, and then if we want we can use the
>> generic XDP at the virtio-bond level to capture traffic on both
>> interfaces at the same time.
>
> I agree dropping XDP makes everything very iffy.
>
>> In the case of netvsc you have control of both sides of a given link
>> so you can match up the RSS tables, queue configuration and everything
>> is somewhat symmetric since you are running the PF and all the HyperV
>> subchannels. Most of the complexity is pushed down into the host and
>> your subchannel management is synchronized there if I am not mistaken.
>> We don't have this in the case of this virtio-bond setup. Instead a
>> single bit is set indicating "backup" without indicating what that
>> means to topology other than the fact that this virtio interface is
>> the backup for some other interface. We are essentially blind other
>> than having the link status for the VF and virtio and knowing that the
>> virtio is intended to be the backup.
>
> Would you be interested in posting at least a proof of concept
> patch for this approach? It's OK if there are some TODO stubs.
> It would be much easier to compare code to code rather than
> a high level description to code.

That is the general idea. I was hoping to go the bonding route last
week but I got too snarled up trying to untangle the features we
didn't need. I have some code I am working on but don't have an ETA
for when it will be done.

At this point I am hoping we can build something based on Sridhar's
original patch that addresses the items I brought up and shifts to
more of a 3 netdev model. If I am not mistaken we might be able to do
it as a separate driver that has a pair of calls that allow for
adding/removing a virt-bond that is provided with a MAC address and a
device pointer so that we can do the bits necessary to get ourselves
swapped with the original virtio device and identify the virtio as the
backup channel.
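
Purely as an illustration of the shape of that pair of calls (none of
these functions exist anywhere yet, the names and arguments are just
placeholders):

/* create/destroy a virt-bond master on behalf of a paravirtual backend */
struct net_device *virt_bond_register(struct device *parent, const u8 *mac,
				      struct net_device *backup_netdev);
void virt_bond_unregister(struct net_device *bond_dev);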

Thanks.

- Alex
diff mbox series

Patch

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index f149a160a8c5..0e58d364fde9 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -30,6 +30,8 @@ 
 #include <linux/cpu.h>
 #include <linux/average.h>
 #include <linux/filter.h>
+#include <linux/netdevice.h>
+#include <linux/netpoll.h>
 #include <net/route.h>
 #include <net/xdp.h>
 
@@ -120,6 +122,15 @@  struct receive_queue {
 	struct xdp_rxq_info xdp_rxq;
 };
 
+struct virtnet_vf_pcpu_stats {
+	u64	rx_packets;
+	u64	rx_bytes;
+	u64	tx_packets;
+	u64	tx_bytes;
+	struct u64_stats_sync   syncp;
+	u32	tx_dropped;
+};
+
 struct virtnet_info {
 	struct virtio_device *vdev;
 	struct virtqueue *cvq;
@@ -182,6 +193,10 @@  struct virtnet_info {
 	u32 speed;
 
 	unsigned long guest_offloads;
+
+	/* State to manage the associated VF interface. */
+	struct net_device __rcu *vf_netdev;
+	struct virtnet_vf_pcpu_stats __percpu *vf_stats;
 };
 
 struct padded_vnet_hdr {
@@ -1314,16 +1329,53 @@  static int xmit_skb(struct send_queue *sq, struct sk_buff *skb)
 	return virtqueue_add_outbuf(sq->vq, sq->sg, num_sg, skb, GFP_ATOMIC);
 }
 
+/* Send skb on the slave VF device. */
+static int virtnet_vf_xmit(struct net_device *dev, struct net_device *vf_netdev,
+			   struct sk_buff *skb)
+{
+	struct virtnet_info *vi = netdev_priv(dev);
+	unsigned int len = skb->len;
+	int rc;
+
+	skb->dev = vf_netdev;
+	skb->queue_mapping = qdisc_skb_cb(skb)->slave_dev_queue_mapping;
+
+	rc = dev_queue_xmit(skb);
+	if (likely(rc == NET_XMIT_SUCCESS || rc == NET_XMIT_CN)) {
+		struct virtnet_vf_pcpu_stats *pcpu_stats
+			= this_cpu_ptr(vi->vf_stats);
+
+		u64_stats_update_begin(&pcpu_stats->syncp);
+		pcpu_stats->tx_packets++;
+		pcpu_stats->tx_bytes += len;
+		u64_stats_update_end(&pcpu_stats->syncp);
+	} else {
+		this_cpu_inc(vi->vf_stats->tx_dropped);
+	}
+
+	return rc;
+}
+
 static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct virtnet_info *vi = netdev_priv(dev);
 	int qnum = skb_get_queue_mapping(skb);
 	struct send_queue *sq = &vi->sq[qnum];
+	struct net_device *vf_netdev;
 	int err;
 	struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum);
 	bool kick = !skb->xmit_more;
 	bool use_napi = sq->napi.weight;
 
+	/* If VF is present and up then redirect packets
+	 * called with rcu_read_lock_bh
+	 */
+	vf_netdev = rcu_dereference_bh(vi->vf_netdev);
+	if (vf_netdev && netif_running(vf_netdev) &&
+	    !netpoll_tx_running(dev) &&
+	    is_unicast_ether_addr(eth_hdr(skb)->h_dest))
+		return virtnet_vf_xmit(dev, vf_netdev, skb);
+
 	/* Free up any pending old buffers before queueing new ones. */
 	free_old_xmit_skbs(sq);
 
@@ -1470,10 +1522,41 @@  static int virtnet_set_mac_address(struct net_device *dev, void *p)
 	return ret;
 }
 
+static void virtnet_get_vf_stats(struct net_device *dev,
+				 struct virtnet_vf_pcpu_stats *tot)
+{
+	struct virtnet_info *vi = netdev_priv(dev);
+	int i;
+
+	memset(tot, 0, sizeof(*tot));
+
+	for_each_possible_cpu(i) {
+		const struct virtnet_vf_pcpu_stats *stats
+				= per_cpu_ptr(vi->vf_stats, i);
+		u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
+		unsigned int start;
+
+		do {
+			start = u64_stats_fetch_begin_irq(&stats->syncp);
+			rx_packets = stats->rx_packets;
+			tx_packets = stats->tx_packets;
+			rx_bytes = stats->rx_bytes;
+			tx_bytes = stats->tx_bytes;
+		} while (u64_stats_fetch_retry_irq(&stats->syncp, start));
+
+		tot->rx_packets += rx_packets;
+		tot->tx_packets += tx_packets;
+		tot->rx_bytes   += rx_bytes;
+		tot->tx_bytes   += tx_bytes;
+		tot->tx_dropped += stats->tx_dropped;
+	}
+}
+
 static void virtnet_stats(struct net_device *dev,
 			  struct rtnl_link_stats64 *tot)
 {
 	struct virtnet_info *vi = netdev_priv(dev);
+	struct virtnet_vf_pcpu_stats vf_stats;
 	int cpu;
 	unsigned int start;
 
@@ -1504,6 +1587,13 @@  static void virtnet_stats(struct net_device *dev,
 	tot->rx_dropped = dev->stats.rx_dropped;
 	tot->rx_length_errors = dev->stats.rx_length_errors;
 	tot->rx_frame_errors = dev->stats.rx_frame_errors;
+
+	virtnet_get_vf_stats(dev, &vf_stats);
+	tot->rx_packets += vf_stats.rx_packets;
+	tot->tx_packets += vf_stats.tx_packets;
+	tot->rx_bytes += vf_stats.rx_bytes;
+	tot->tx_bytes += vf_stats.tx_bytes;
+	tot->tx_dropped += vf_stats.tx_dropped;
 }
 
 #ifdef CONFIG_NET_POLL_CONTROLLER
@@ -2635,6 +2725,13 @@  static int virtnet_probe(struct virtio_device *vdev)
 
 	INIT_WORK(&vi->config_work, virtnet_config_changed_work);
 
+	if (virtio_has_feature(vdev, VIRTIO_NET_F_BACKUP)) {
+		vi->vf_stats =
+			netdev_alloc_pcpu_stats(struct virtnet_vf_pcpu_stats);
+		if (!vi->vf_stats)
+			goto free_stats;
+	}
+
 	/* If we can receive ANY GSO packets, we must allocate large ones. */
 	if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO4) ||
 	    virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO6) ||
@@ -2668,7 +2765,7 @@  static int virtnet_probe(struct virtio_device *vdev)
 			 */
 			dev_err(&vdev->dev, "device MTU appears to have changed "
 				"it is now %d < %d", mtu, dev->min_mtu);
-			goto free_stats;
+			goto free_vf_stats;
 		}
 
 		dev->mtu = mtu;
@@ -2692,7 +2789,7 @@  static int virtnet_probe(struct virtio_device *vdev)
 	/* Allocate/initialize the rx/tx queues, and invoke find_vqs */
 	err = init_vqs(vi);
 	if (err)
-		goto free_stats;
+		goto free_vf_stats;
 
 #ifdef CONFIG_SYSFS
 	if (vi->mergeable_rx_bufs)
@@ -2747,6 +2844,8 @@  static int virtnet_probe(struct virtio_device *vdev)
 	cancel_delayed_work_sync(&vi->refill);
 	free_receive_page_frags(vi);
 	virtnet_del_vqs(vi);
+free_vf_stats:
+	free_percpu(vi->vf_stats);
 free_stats:
 	free_percpu(vi->stats);
 free:
@@ -2768,19 +2867,184 @@  static void remove_vq_common(struct virtnet_info *vi)
 	virtnet_del_vqs(vi);
 }
 
+static struct net_device *get_virtio_bymac(const u8 *mac)
+{
+	struct net_device *dev;
+
+	ASSERT_RTNL();
+
+	for_each_netdev(&init_net, dev) {
+		if (dev->netdev_ops != &virtnet_netdev)
+			continue;       /* not a virtio_net device */
+
+		if (ether_addr_equal(mac, dev->perm_addr))
+			return dev;
+	}
+
+	return NULL;
+}
+
+static struct net_device *get_virtio_byref(struct net_device *vf_netdev)
+{
+	struct net_device *dev;
+
+	ASSERT_RTNL();
+
+	for_each_netdev(&init_net, dev) {
+		struct virtnet_info *vi;
+
+		if (dev->netdev_ops != &virtnet_netdev)
+			continue;	/* not a virtio_net device */
+
+		vi = netdev_priv(dev);
+		if (rtnl_dereference(vi->vf_netdev) == vf_netdev)
+			return dev;	/* a match */
+	}
+
+	return NULL;
+}
+
+/* Called when VF is injecting data into network stack.
+ * Change the associated network device from VF to virtio.
+ * note: already called with rcu_read_lock
+ */
+static rx_handler_result_t virtnet_vf_handle_frame(struct sk_buff **pskb)
+{
+	struct sk_buff *skb = *pskb;
+	struct net_device *ndev = rcu_dereference(skb->dev->rx_handler_data);
+	struct virtnet_info *vi = netdev_priv(ndev);
+	struct virtnet_vf_pcpu_stats *pcpu_stats =
+				this_cpu_ptr(vi->vf_stats);
+
+	skb->dev = ndev;
+
+	u64_stats_update_begin(&pcpu_stats->syncp);
+	pcpu_stats->rx_packets++;
+	pcpu_stats->rx_bytes += skb->len;
+	u64_stats_update_end(&pcpu_stats->syncp);
+
+	return RX_HANDLER_ANOTHER;
+}
+
+static int virtnet_vf_join(struct net_device *vf_netdev,
+			   struct net_device *ndev)
+{
+	int ret;
+
+	ret = netdev_rx_handler_register(vf_netdev,
+					 virtnet_vf_handle_frame, ndev);
+	if (ret != 0) {
+		netdev_err(vf_netdev,
+			   "can not register virtio VF receive handler (err = %d)\n",
+			   ret);
+		goto rx_handler_failed;
+	}
+
+	ret = netdev_upper_dev_link(vf_netdev, ndev, NULL);
+	if (ret != 0) {
+		netdev_err(vf_netdev,
+			   "can not set master device %s (err = %d)\n",
+			   ndev->name, ret);
+		goto upper_link_failed;
+	}
+
+	vf_netdev->flags |= IFF_SLAVE;
+
+	/* Align MTU of VF with master */
+	ret = dev_set_mtu(vf_netdev, ndev->mtu);
+	if (ret)
+		netdev_warn(vf_netdev,
+			    "unable to change mtu to %u\n", ndev->mtu);
+
+	call_netdevice_notifiers(NETDEV_JOIN, vf_netdev);
+
+	netdev_info(vf_netdev, "joined to %s\n", ndev->name);
+	return 0;
+
+upper_link_failed:
+	netdev_rx_handler_unregister(vf_netdev);
+rx_handler_failed:
+	return ret;
+}
+
+static int virtnet_register_vf(struct net_device *vf_netdev)
+{
+	struct net_device *ndev;
+	struct virtnet_info *vi;
+
+	if (vf_netdev->addr_len != ETH_ALEN)
+		return NOTIFY_DONE;
+
+	/* We will use the MAC address to locate the virtio_net interface to
+	 * associate with the VF interface. If we don't find a matching
+	 * virtio interface, move on.
+	 */
+	ndev = get_virtio_bymac(vf_netdev->perm_addr);
+	if (!ndev)
+		return NOTIFY_DONE;
+
+	vi = netdev_priv(ndev);
+	if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_BACKUP))
+		return NOTIFY_DONE;
+
+	if (rtnl_dereference(vi->vf_netdev))
+		return NOTIFY_DONE;
+
+	if (virtnet_vf_join(vf_netdev, ndev) != 0)
+		return NOTIFY_DONE;
+
+	netdev_info(ndev, "VF registering %s\n", vf_netdev->name);
+
+	dev_hold(vf_netdev);
+	rcu_assign_pointer(vi->vf_netdev, vf_netdev);
+
+	return NOTIFY_OK;
+}
+
+static int virtnet_unregister_vf(struct net_device *vf_netdev)
+{
+	struct net_device *ndev;
+	struct virtnet_info *vi;
+
+	ndev = get_virtio_byref(vf_netdev);
+	if (!ndev)
+		return NOTIFY_DONE;
+
+	vi = netdev_priv(ndev);
+	if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_BACKUP))
+		return NOTIFY_DONE;
+
+	netdev_info(ndev, "VF unregistering %s\n", vf_netdev->name);
+
+	netdev_rx_handler_unregister(vf_netdev);
+	netdev_upper_dev_unlink(vf_netdev, ndev);
+	RCU_INIT_POINTER(vi->vf_netdev, NULL);
+	dev_put(vf_netdev);
+
+	return NOTIFY_OK;
+}
+
 static void virtnet_remove(struct virtio_device *vdev)
 {
 	struct virtnet_info *vi = vdev->priv;
+	struct net_device *vf_netdev;
 
 	virtnet_cpu_notif_remove(vi);
 
 	/* Make sure no work handler is accessing the device. */
 	flush_work(&vi->config_work);
 
+	rtnl_lock();
+	vf_netdev = rtnl_dereference(vi->vf_netdev);
+	if (vf_netdev)
+		virtnet_unregister_vf(vf_netdev);
+	rtnl_unlock();
+
 	unregister_netdev(vi->dev);
 
 	remove_vq_common(vi);
 
+	free_percpu(vi->vf_stats);
 	free_percpu(vi->stats);
 	free_netdev(vi->dev);
 }
@@ -2859,6 +3123,42 @@  static struct virtio_driver virtio_net_driver = {
 #endif
 };
 
+static int virtio_netdev_event(struct notifier_block *this,
+			       unsigned long event, void *ptr)
+{
+	struct net_device *event_dev = netdev_notifier_info_to_dev(ptr);
+
+	/* Skip our own events */
+	if (event_dev->netdev_ops == &virtnet_netdev)
+		return NOTIFY_DONE;
+
+	/* Avoid non-Ethernet type devices */
+	if (event_dev->type != ARPHRD_ETHER)
+		return NOTIFY_DONE;
+
+	/* Avoid Vlan dev with same MAC registering as VF */
+	if (is_vlan_dev(event_dev))
+		return NOTIFY_DONE;
+
+	/* Avoid Bonding master dev with same MAC registering as VF */
+	if ((event_dev->priv_flags & IFF_BONDING) &&
+	    (event_dev->flags & IFF_MASTER))
+		return NOTIFY_DONE;
+
+	switch (event) {
+	case NETDEV_REGISTER:
+		return virtnet_register_vf(event_dev);
+	case NETDEV_UNREGISTER:
+		return virtnet_unregister_vf(event_dev);
+	default:
+		return NOTIFY_DONE;
+	}
+}
+
+static struct notifier_block virtio_netdev_notifier = {
+	.notifier_call = virtio_netdev_event,
+};
+
 static __init int virtio_net_driver_init(void)
 {
 	int ret;
@@ -2877,6 +3177,8 @@  static __init int virtio_net_driver_init(void)
         ret = register_virtio_driver(&virtio_net_driver);
 	if (ret)
 		goto err_virtio;
+
+	register_netdevice_notifier(&virtio_netdev_notifier);
 	return 0;
 err_virtio:
 	cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD);
@@ -2889,6 +3191,7 @@  module_init(virtio_net_driver_init);
 
 static __exit void virtio_net_driver_exit(void)
 {
+	unregister_netdevice_notifier(&virtio_netdev_notifier);
 	unregister_virtio_driver(&virtio_net_driver);
 	cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD);
 	cpuhp_remove_multi_state(virtionet_online);