
[ovs-dev] netdev-afxdp: Add start qid support.

Message ID 1612387304-68681-1-git-send-email-u9012063@gmail.com
State Changes Requested
Series [ovs-dev] netdev-afxdp: Add start qid support.

Commit Message

William Tu Feb. 3, 2021, 9:21 p.m. UTC
Mellanox cards have a different XSK design: they require users to create
dedicated queues for XSK. Unlike Intel NICs, which load the XDP program on
all queues, Mellanox loads the XDP program only on a subset of its queues.

When OVS uses AF_XDP with mlx5, it doesn't replace the existing RX and TX
queues in the channel with XSK RX and XSK TX queues, but it creates an
additional pair of queues for XSK in that channel. To distinguish
regular and XSK queues, mlx5 uses a different range of qids.
That means, if the card has 24 queues, queues 0..11 correspond to
regular queues, and queues 12..23 are XSK queues.
In this case, we should attach the netdev-afxdp with 'start-qid=12'.
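For example, with 'start-qid=12' and n_rxq=4, OVS rx queues 0..3 would then
attach their AF_XDP sockets to device queues 12..15 (the patch simply opens
the socket on dev->startqid + queue_id).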

I tested using a Mellanox ConnectX-6 Dx by setting 'start-qid=1', and:
  $ ethtool -L enp2s0f0np0 combined 1
  # queue 0 is for non-XDP traffic, queue 1 is for XSK
  $ ethtool -N enp2s0f0np0 flow-type udp4 action 1
Note: we additionally need to add a flow-redirect rule to queue 1.
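For illustration, the device in this test could then be attached roughly as
follows (the bridge name 'br0' is only an example; 'start-qid' is the option
added by this patch, the other options already exist):
  $ ovs-vsctl add-port br0 enp2s0f0np0 -- set interface enp2s0f0np0 \
        type="afxdp" options:n_rxq=1 options:start-qid=1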

Tested-at: https://github.com/williamtu/ovs-travis/actions/runs/535141041
Signed-off-by: William Tu <u9012063@gmail.com>
---
 Documentation/intro/install/afxdp.rst |  2 ++
 lib/netdev-afxdp.c                    | 23 ++++++++++++++++++-----
 lib/netdev-linux-private.h            |  1 +
 vswitchd/vswitch.xml                  |  8 ++++++++
 4 files changed, 29 insertions(+), 5 deletions(-)

Comments

Gregory Rose Feb. 4, 2021, 11:17 p.m. UTC | #1
On 2/3/2021 1:21 PM, William Tu wrote:
> Mellanox card has different XSK design. It requires users to create
> dedicated queues for XSK. Unlike Intel's NIC which loads XDP program
> to all queues, Mellanox only loads XDP program to a subset of its queue.
> 
> When OVS uses AF_XDP with mlx5, it doesn't replace the existing RX and TX
> queues in the channel with XSK RX and XSK TX queues, but it creates an
> additional pair of queues for XSK in that channel. To distinguish
> regular and XSK queues, mlx5 uses a different range of qids.
> That means, if the card has 24 queues, queues 0..11 correspond to
> regular queues, and queues 12..23 are XSK queues.
> In this case, we should attach the netdev-afxdp with 'start-qid=12'.
> 
> I tested using Mellanox Connect-X 6Dx, by setting 'start-qid=1', and:
>    $ ethtool -L enp2s0f0np0 combined 1
>    # queue 0 is for non-XDP traffic, queue 1 is for XSK
>    $ ethtool -N enp2s0f0np0 flow-type udp4 action 1
> note: we need additionally add flow-redirect rule to queue 1

Seems awfully hardware dependent.  Is this just for Mellanox or does
it have general usefulness?

- Greg

> 
> Tested-at: https://github.com/williamtu/ovs-travis/actions/runs/535141041
> Signed-off-by: William Tu <u9012063@gmail.com>
> ---
>   Documentation/intro/install/afxdp.rst |  2 ++
>   lib/netdev-afxdp.c                    | 23 ++++++++++++++++++-----
>   lib/netdev-linux-private.h            |  1 +
>   vswitchd/vswitch.xml                  |  8 ++++++++
>   4 files changed, 29 insertions(+), 5 deletions(-)
> 
> diff --git a/Documentation/intro/install/afxdp.rst b/Documentation/intro/install/afxdp.rst
> index f2643e0d41a1..eac298c52575 100644
> --- a/Documentation/intro/install/afxdp.rst
> +++ b/Documentation/intro/install/afxdp.rst
> @@ -204,6 +204,8 @@ more details):
>    * ``use-need-wakeup``: default ``true`` if libbpf supports it,
>      otherwise ``false``.
>   
> +* ``start-qid``: default ``0``.
> +
>   For example, to use 1 PMD (on core 4) on 1 queue (queue 0) device,
>   configure these options: ``pmd-cpu-mask``, ``pmd-rxq-affinity``, and
>   ``n_rxq``::
> diff --git a/lib/netdev-afxdp.c b/lib/netdev-afxdp.c
> index 482400d8d135..36f6a323b1bc 100644
> --- a/lib/netdev-afxdp.c
> +++ b/lib/netdev-afxdp.c
> @@ -458,12 +458,13 @@ xsk_configure_queue(struct netdev_linux *dev, int ifindex, int queue_id,
>       VLOG_DBG("%s: configuring queue: %d, mode: %s, use-need-wakeup: %s.",
>                netdev_get_name(&dev->up), queue_id, xdp_modes[mode].name,
>                dev->use_need_wakeup ? "true" : "false");
> -    xsk_info = xsk_configure(ifindex, queue_id, mode, dev->use_need_wakeup,
> -                             report_socket_failures);
> +    xsk_info = xsk_configure(ifindex, dev->startqid + queue_id, mode,
> +                             dev->use_need_wakeup, report_socket_failures);
>       if (!xsk_info) {
>           VLOG(report_socket_failures ? VLL_ERR : VLL_DBG,
> -             "%s: Failed to create AF_XDP socket on queue %d in %s mode.",
> -             netdev_get_name(&dev->up), queue_id, xdp_modes[mode].name);
> +             "%s: Failed to create AF_XDP socket on queue %d+%d in %s mode.",
> +             netdev_get_name(&dev->up), dev->startqid, queue_id,
> +             xdp_modes[mode].name);
>           dev->xsks[queue_id] = NULL;
>           return -1;
>       }
> @@ -604,6 +605,7 @@ netdev_afxdp_set_config(struct netdev *netdev, const struct smap *args,
>       enum afxdp_mode xdp_mode;
>       bool need_wakeup;
>       int new_n_rxq;
> +    int new_startqid;
>   
>       ovs_mutex_lock(&dev->mutex);
>       new_n_rxq = MAX(smap_get_int(args, "n_rxq", NR_QUEUE), 1);
> @@ -637,12 +639,18 @@ netdev_afxdp_set_config(struct netdev *netdev, const struct smap *args,
>       }
>   #endif
>   
> +    /* TODO: need to check
> +     * new_startqid + new_n_rxq > total dev's queues. */
> +    new_startqid = smap_get_int(args, "start-qid", 0);
> +
>       if (dev->requested_n_rxq != new_n_rxq
>           || dev->requested_xdp_mode != xdp_mode
> -        || dev->requested_need_wakeup != need_wakeup) {
> +        || dev->requested_need_wakeup != need_wakeup
> +        || dev->requested_startqid != new_startqid) {
>           dev->requested_n_rxq = new_n_rxq;
>           dev->requested_xdp_mode = xdp_mode;
>           dev->requested_need_wakeup = need_wakeup;
> +        dev->requested_startqid = new_startqid;
>           netdev_request_reconfigure(netdev);
>       }
>       ovs_mutex_unlock(&dev->mutex);
> @@ -661,6 +669,7 @@ netdev_afxdp_get_config(const struct netdev *netdev, struct smap *args)
>                       xdp_modes[dev->xdp_mode_in_use].name);
>       smap_add_format(args, "use-need-wakeup", "%s",
>                       dev->use_need_wakeup ? "true" : "false");
> +    smap_add_format(args, "start-qid", "%d", dev->startqid);
>       ovs_mutex_unlock(&dev->mutex);
>       return 0;
>   }
> @@ -696,6 +705,7 @@ netdev_afxdp_reconfigure(struct netdev *netdev)
>       if (netdev->n_rxq == dev->requested_n_rxq
>           && dev->xdp_mode == dev->requested_xdp_mode
>           && dev->use_need_wakeup == dev->requested_need_wakeup
> +        && dev->startqid == dev->requested_startqid
>           && dev->xsks) {
>           goto out;
>       }
> @@ -713,6 +723,7 @@ netdev_afxdp_reconfigure(struct netdev *netdev)
>           VLOG_ERR("setrlimit(RLIMIT_MEMLOCK) failed: %s", ovs_strerror(errno));
>       }
>       dev->use_need_wakeup = dev->requested_need_wakeup;
> +    dev->startqid = dev->requested_startqid;
>   
>       err = xsk_configure_all(netdev);
>       if (err) {
> @@ -1177,12 +1188,14 @@ netdev_afxdp_construct(struct netdev *netdev)
>       /* Queues should not be used before the first reconfiguration. Clearing. */
>       netdev->n_rxq = 0;
>       netdev->n_txq = 0;
> +    dev->startqid = 0;
>       dev->xdp_mode = OVS_AF_XDP_MODE_UNSPEC;
>       dev->xdp_mode_in_use = OVS_AF_XDP_MODE_UNSPEC;
>   
>       dev->requested_n_rxq = NR_QUEUE;
>       dev->requested_xdp_mode = OVS_AF_XDP_MODE_BEST_EFFORT;
>       dev->requested_need_wakeup = NEED_WAKEUP_DEFAULT;
> +    dev->requested_startqid = 0;
>   
>       dev->xsks = NULL;
>       dev->tx_locks = NULL;
> diff --git a/lib/netdev-linux-private.h b/lib/netdev-linux-private.h
> index c7c515f70700..242ce1659614 100644
> --- a/lib/netdev-linux-private.h
> +++ b/lib/netdev-linux-private.h
> @@ -109,6 +109,7 @@ struct netdev_linux {
>       /* AF_XDP information. */
>       struct xsk_socket_info **xsks;
>       int requested_n_rxq;
> +    int startqid, requested_startqid;
>   
>       enum afxdp_mode xdp_mode;               /* Configured AF_XDP mode. */
>       enum afxdp_mode requested_xdp_mode;     /* Requested  AF_XDP mode. */
> diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml
> index a2ad84edefa9..47e14ba67c3a 100644
> --- a/vswitchd/vswitch.xml
> +++ b/vswitchd/vswitch.xml
> @@ -3290,6 +3290,14 @@ ovs-vsctl add-port br0 p0 -- set Interface p0 type=patch options:peer=p1 \
>           </p>
>         </column>
>   
> +      <column name="options" key="start-qid"
> +              type='{"type": "integer", "minInteger": 0, "maxInteger": 32}'>
> +        <p>
> +          Specifies the starting XDP socket's queue id.
> +          Defaults to 0.
> +        </p>
> +      </column>
> +
>         <column name="options" key="vhost-server-path"
>                 type='{"type": "string"}'>
>           <p>
>
William Tu Feb. 5, 2021, 3:08 a.m. UTC | #2
On Thu, Feb 4, 2021 at 3:17 PM Gregory Rose <gvrose8192@gmail.com> wrote:
>
>
>
> On 2/3/2021 1:21 PM, William Tu wrote:
> > Mellanox card has different XSK design. It requires users to create
> > dedicated queues for XSK. Unlike Intel's NIC which loads XDP program
> > to all queues, Mellanox only loads XDP program to a subset of its queue.
> >
> > When OVS uses AF_XDP with mlx5, it doesn't replace the existing RX and TX
> > queues in the channel with XSK RX and XSK TX queues, but it creates an
> > additional pair of queues for XSK in that channel. To distinguish
> > regular and XSK queues, mlx5 uses a different range of qids.
> > That means, if the card has 24 queues, queues 0..11 correspond to
> > regular queues, and queues 12..23 are XSK queues.
> > In this case, we should attach the netdev-afxdp with 'start-qid=12'.
> >
> > I tested using Mellanox Connect-X 6Dx, by setting 'start-qid=1', and:
> >    $ ethtool -L enp2s0f0np0 combined 1
> >    # queue 0 is for non-XDP traffic, queue 1 is for XSK
> >    $ ethtool -N enp2s0f0np0 flow-type udp4 action 1
> > note: we need additionally add flow-redirect rule to queue 1
>
> Seems awfully hardware dependent.  Is this just for Mellanox or does
> it have general usefulness?
>
It is just Mellanox's design, which requires pre-configuring the flow director.
I only have cards from Intel and Mellanox, so I don't know about other vendors.

Thanks,
William
Gregory Rose Feb. 5, 2021, 9:08 p.m. UTC | #3
On 2/4/2021 7:08 PM, William Tu wrote:
> On Thu, Feb 4, 2021 at 3:17 PM Gregory Rose <gvrose8192@gmail.com> wrote:
>>
>>
>>
>> On 2/3/2021 1:21 PM, William Tu wrote:
>>> Mellanox card has different XSK design. It requires users to create
>>> dedicated queues for XSK. Unlike Intel's NIC which loads XDP program
>>> to all queues, Mellanox only loads XDP program to a subset of its queue.
>>>
>>> When OVS uses AF_XDP with mlx5, it doesn't replace the existing RX and TX
>>> queues in the channel with XSK RX and XSK TX queues, but it creates an
>>> additional pair of queues for XSK in that channel. To distinguish
>>> regular and XSK queues, mlx5 uses a different range of qids.
>>> That means, if the card has 24 queues, queues 0..11 correspond to
>>> regular queues, and queues 12..23 are XSK queues.
>>> In this case, we should attach the netdev-afxdp with 'start-qid=12'.
>>>
>>> I tested using Mellanox Connect-X 6Dx, by setting 'start-qid=1', and:
>>>     $ ethtool -L enp2s0f0np0 combined 1
>>>     # queue 0 is for non-XDP traffic, queue 1 is for XSK
>>>     $ ethtool -N enp2s0f0np0 flow-type udp4 action 1
>>> note: we need additionally add flow-redirect rule to queue 1
>>
>> Seems awfully hardware dependent.  Is this just for Mellanox or does
>> it have general usefulness?
>>
> It is just Mellanox's design which requires pre-configure the flow-director.
> I only have cards from Intel and Mellanox so I don't know about other vendors.
> 
> Thanks,
> William
> 

I think we need to abstract the HW layer a little bit.  This start-qid
option is specific to a single piece of HW, at least at this point.
We should expect that further HW-specific requirements for
different NIC vendors will come up in the future.  I suggest
adding a hw_options:mellanox:start-qid type hierarchy so that
as new HW requirements come up we can easily scale.  It will
also make adding new vendors easier in the future.
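Purely as a hypothetical sketch of that idea (none of these keys exist in
OVS today; the names are made up only to show the shape of such a hierarchy),
it might look something like:
  $ ovs-vsctl set interface enp2s0f0np0 \
        options:hw_options=mellanox options:mellanox-start-qid=12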

Even within a single NIC vendor, you can't count on each new generation
of a design to keep the old requirements and methods for feature
enablement.

What do you think?

- Greg
William Tu Feb. 6, 2021, 5 p.m. UTC | #4
On Fri, Feb 5, 2021 at 1:08 PM Gregory Rose <gvrose8192@gmail.com> wrote:
>
>
>
> On 2/4/2021 7:08 PM, William Tu wrote:
> > On Thu, Feb 4, 2021 at 3:17 PM Gregory Rose <gvrose8192@gmail.com> wrote:
> >>
> >>
> >>
> >> On 2/3/2021 1:21 PM, William Tu wrote:
> >>> Mellanox card has different XSK design. It requires users to create
> >>> dedicated queues for XSK. Unlike Intel's NIC which loads XDP program
> >>> to all queues, Mellanox only loads XDP program to a subset of its queue.
> >>>
> >>> When OVS uses AF_XDP with mlx5, it doesn't replace the existing RX and TX
> >>> queues in the channel with XSK RX and XSK TX queues, but it creates an
> >>> additional pair of queues for XSK in that channel. To distinguish
> >>> regular and XSK queues, mlx5 uses a different range of qids.
> >>> That means, if the card has 24 queues, queues 0..11 correspond to
> >>> regular queues, and queues 12..23 are XSK queues.
> >>> In this case, we should attach the netdev-afxdp with 'start-qid=12'.
> >>>
> >>> I tested using Mellanox Connect-X 6Dx, by setting 'start-qid=1', and:
> >>>     $ ethtool -L enp2s0f0np0 combined 1
> >>>     # queue 0 is for non-XDP traffic, queue 1 is for XSK
> >>>     $ ethtool -N enp2s0f0np0 flow-type udp4 action 1
> >>> note: we need additionally add flow-redirect rule to queue 1
> >>
> >> Seems awfully hardware dependent.  Is this just for Mellanox or does
> >> it have general usefulness?
> >>
> > It is just Mellanox's design which requires pre-configure the flow-director.
> > I only have cards from Intel and Mellanox so I don't know about other vendors.
> >
> > Thanks,
> > William
> >
>
> I think we need to abstract the HW layer a little bit.  This start-qid
> option is specific to a single piece of HW, at least at this point.
> We should expect that further HW  specific requirements for
> different NIC vendors will come up in the future.  I suggest
> adding a hw_options:mellanox:start-qid type hierarchy  so that
> as new HW requirements come up we can easily scale.  It will
> also make adding new vendors easier in the future.
>
> Even with NIC vendors you can't always count on each new generation
> design to always keep old requirements and methods for feature
> enablement.
>
> What do you think?
>
Thanks for the feedback.
So far I don't know whether other vendors will need this option or not.
I think adding another "hw_options" is a little confusing because this
is already an option on the device.
The AF_XDP driver in DPDK also has a similar option; see 'start_queue' in
https://doc.dpdk.org/guides/nics/af_xdp.html
Toshiaki Makita Feb. 7, 2021, 4:05 p.m. UTC | #5
On 2021/02/07 2:00, William Tu wrote:
> On Fri, Feb 5, 2021 at 1:08 PM Gregory Rose <gvrose8192@gmail.com> wrote:
>> On 2/4/2021 7:08 PM, William Tu wrote:
>>> On Thu, Feb 4, 2021 at 3:17 PM Gregory Rose <gvrose8192@gmail.com> wrote:
>>>> On 2/3/2021 1:21 PM, William Tu wrote:
>>>>> Mellanox card has different XSK design. It requires users to create
>>>>> dedicated queues for XSK. Unlike Intel's NIC which loads XDP program
>>>>> to all queues, Mellanox only loads XDP program to a subset of its queue.
>>>>>
>>>>> When OVS uses AF_XDP with mlx5, it doesn't replace the existing RX and TX
>>>>> queues in the channel with XSK RX and XSK TX queues, but it creates an
>>>>> additional pair of queues for XSK in that channel. To distinguish
>>>>> regular and XSK queues, mlx5 uses a different range of qids.
>>>>> That means, if the card has 24 queues, queues 0..11 correspond to
>>>>> regular queues, and queues 12..23 are XSK queues.
>>>>> In this case, we should attach the netdev-afxdp with 'start-qid=12'.
>>>>>
>>>>> I tested using Mellanox Connect-X 6Dx, by setting 'start-qid=1', and:
>>>>>      $ ethtool -L enp2s0f0np0 combined 1
>>>>>      # queue 0 is for non-XDP traffic, queue 1 is for XSK
>>>>>      $ ethtool -N enp2s0f0np0 flow-type udp4 action 1
>>>>> note: we need additionally add flow-redirect rule to queue 1
>>>>
>>>> Seems awfully hardware dependent.  Is this just for Mellanox or does
>>>> it have general usefulness?
>>>>
>>> It is just Mellanox's design which requires pre-configure the flow-director.
>>> I only have cards from Intel and Mellanox so I don't know about other vendors.
>>>
>>> Thanks,
>>> William
>>>
>>
>> I think we need to abstract the HW layer a little bit.  This start-qid
>> option is specific to a single piece of HW, at least at this point.
>> We should expect that further HW  specific requirements for
>> different NIC vendors will come up in the future.  I suggest
>> adding a hw_options:mellanox:start-qid type hierarchy  so that
>> as new HW requirements come up we can easily scale.  It will
>> also make adding new vendors easier in the future.
>>
>> Even with NIC vendors you can't always count on each new generation
>> design to always keep old requirements and methods for feature
>> enablement.
>>
>> What do you think?
>>
> Thanks for the feedback.
> So far I don't know whether other vendors will need this option or not.

FWIU, this API ("The lower half of the available amount of RX queues are regular
queues, and the upper half are XSK RX queues.") is the result of a long discussion
about supporting dedicated/isolated XSK rings, and it is not meant to be a
Mellanox-specific feature.

https://patchwork.ozlabs.org/project/netdev/cover/20190524093431.20887-1-maximmi@mellanox.com/
https://patchwork.ozlabs.org/project/netdev/cover/20190612155605.22450-1-maximmi@mellanox.com/

Toshiaki Makita

> I think adding another "hw_options" is a little confusing because this
> is already an option on the device.
> Looking at AF_XDP driver at DPDK, it also has similar option:
> see start_queue
> https://doc.dpdk.org/guides/nics/af_xdp.html
Ilya Maximets Feb. 8, 2021, 12:58 p.m. UTC | #6
On 2/7/21 5:05 PM, Toshiaki Makita wrote:
> On 2021/02/07 2:00, William Tu wrote:
>> On Fri, Feb 5, 2021 at 1:08 PM Gregory Rose <gvrose8192@gmail.com> wrote:
>>> On 2/4/2021 7:08 PM, William Tu wrote:
>>>> On Thu, Feb 4, 2021 at 3:17 PM Gregory Rose <gvrose8192@gmail.com> wrote:
>>>>> On 2/3/2021 1:21 PM, William Tu wrote:
>>>>>> Mellanox card has different XSK design. It requires users to create
>>>>>> dedicated queues for XSK. Unlike Intel's NIC which loads XDP program
>>>>>> to all queues, Mellanox only loads XDP program to a subset of its queue.
>>>>>>
>>>>>> When OVS uses AF_XDP with mlx5, it doesn't replace the existing RX and TX
>>>>>> queues in the channel with XSK RX and XSK TX queues, but it creates an
>>>>>> additional pair of queues for XSK in that channel. To distinguish
>>>>>> regular and XSK queues, mlx5 uses a different range of qids.
>>>>>> That means, if the card has 24 queues, queues 0..11 correspond to
>>>>>> regular queues, and queues 12..23 are XSK queues.
>>>>>> In this case, we should attach the netdev-afxdp with 'start-qid=12'.
>>>>>>
>>>>>> I tested using Mellanox Connect-X 6Dx, by setting 'start-qid=1', and:
>>>>>>      $ ethtool -L enp2s0f0np0 combined 1
>>>>>>      # queue 0 is for non-XDP traffic, queue 1 is for XSK
>>>>>>      $ ethtool -N enp2s0f0np0 flow-type udp4 action 1
>>>>>> note: we need additionally add flow-redirect rule to queue 1
>>>>>
>>>>> Seems awfully hardware dependent.  Is this just for Mellanox or does
>>>>> it have general usefulness?
>>>>>
>>>> It is just Mellanox's design which requires pre-configure the flow-director.
>>>> I only have cards from Intel and Mellanox so I don't know about other vendors.
>>>>
>>>> Thanks,
>>>> William
>>>>
>>>
>>> I think we need to abstract the HW layer a little bit.  This start-qid
>>> option is specific to a single piece of HW, at least at this point.
>>> We should expect that further HW  specific requirements for
>>> different NIC vendors will come up in the future.  I suggest
>>> adding a hw_options:mellanox:start-qid type hierarchy  so that
>>> as new HW requirements come up we can easily scale.  It will
>>> also make adding new vendors easier in the future.
>>>
>>> Even with NIC vendors you can't always count on each new generation
>>> design to always keep old requirements and methods for feature
>>> enablement.
>>>
>>> What do you think?
>>>
>> Thanks for the feedback.
>> So far I don't know whether other vendors will need this option or not.
> 
> FWIU, this api "The lower half of the available amount of RX queues are regular queues, and the upper half are XSK RX queues." is the result of long discussion to support dedicated/isolated XSK rings, which is not meant for a mellanox-specific feature.
> 
> https://patchwork.ozlabs.org/project/netdev/cover/20190524093431.20887-1-maximmi@mellanox.com/
> https://patchwork.ozlabs.org/project/netdev/cover/20190612155605.22450-1-maximmi@mellanox.com/
> 
> Toshiaki Makita

Thanks for the links.  Very helpful.

From what I understand, the lower half of the queues should still work, i.e.
it should still be possible to attach an AF_XDP socket to them.  But
they will not work in zero-copy mode ("generic" only?).
William, could you check that?  Does it work, and which mode does
"best-effort" end up with?  And what kind of errors does libbpf return
if we're trying to enable zero-copy?

There are still a few unanswered questions in those discussions and
a clear lack of documentation.  It seems to be a work-in-progress,
an intermediate step towards some better user API.  However, it's unclear
what this API will look like and when it will be implemented.


For BPF list and maintainers:

Is it possible to have some of this behavior documented?
How can an application determine which netdevs are using this upper/lower-half
scheme for their regular and XSK channels/queues?  How should users
do that without digging into the kernel code or spending a
few hours googling for presentations from some conferences?

And I actually failed to find any written reference to the fact that
I have to manually configure redirection of the traffic in order to
receive it on XSK queues/channels and not on regular ones.  This is
very confusing and hard to understand, especially for a regular OVS
user who is not familiar with XDP and kernel internals and just wants
to utilize faster userspace networking.

> 
>> I think adding another "hw_options" is a little confusing because this
>> is already an option on the device.
>> Looking at AF_XDP driver at DPDK, it also has similar option:
>> see start_queue
>> https://doc.dpdk.org/guides/nics/af_xdp.html

For DPDK, this option is mainly positioned as a way to utilize multiple queues
and to create different DPDK ports from different ranges of
queues of the same device, so it's not exactly the same thing.  But
I see that it likely had a double purpose that wasn't mentioned in the
commit message or documentation.
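For reference, a rough sketch of what that looks like on the DPDK side
(parameter names are taken from the DPDK af_xdp guide linked above; the
interface name and values are only illustrative), e.g. two ports carved out
of one device:
  --vdev net_af_xdp0,iface=enp2s0f0np0,start_queue=0,queue_count=1 \
  --vdev net_af_xdp1,iface=enp2s0f0np0,start_queue=1,queue_count=1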

I think, for now, we can have this option in OVS in the same way, i.e.
as a way to partition the device, or as a way to use a particular
range of queues from the device instead of using it as a whole.
This way we will not need any vendor-specific knobs, while
still allowing a generic knob to be used to accommodate vendor-specific
restrictions.

BTW, I didn't look at the patch itself yet, will get back with
the actual review later.

Best regards, Ilya Maximets.
Gregory Rose Feb. 8, 2021, 8:40 p.m. UTC | #7
On 2/6/2021 9:00 AM, William Tu wrote:
> On Fri, Feb 5, 2021 at 1:08 PM Gregory Rose <gvrose8192@gmail.com> wrote:
>>
>>
>>
>> On 2/4/2021 7:08 PM, William Tu wrote:
>>> On Thu, Feb 4, 2021 at 3:17 PM Gregory Rose <gvrose8192@gmail.com> wrote:
>>>>
>>>>
>>>>
>>>> On 2/3/2021 1:21 PM, William Tu wrote:
>>>>> Mellanox card has different XSK design. It requires users to create
>>>>> dedicated queues for XSK. Unlike Intel's NIC which loads XDP program
>>>>> to all queues, Mellanox only loads XDP program to a subset of its queue.
>>>>>
>>>>> When OVS uses AF_XDP with mlx5, it doesn't replace the existing RX and TX
>>>>> queues in the channel with XSK RX and XSK TX queues, but it creates an
>>>>> additional pair of queues for XSK in that channel. To distinguish
>>>>> regular and XSK queues, mlx5 uses a different range of qids.
>>>>> That means, if the card has 24 queues, queues 0..11 correspond to
>>>>> regular queues, and queues 12..23 are XSK queues.
>>>>> In this case, we should attach the netdev-afxdp with 'start-qid=12'.
>>>>>
>>>>> I tested using Mellanox Connect-X 6Dx, by setting 'start-qid=1', and:
>>>>>      $ ethtool -L enp2s0f0np0 combined 1
>>>>>      # queue 0 is for non-XDP traffic, queue 1 is for XSK
>>>>>      $ ethtool -N enp2s0f0np0 flow-type udp4 action 1
>>>>> note: we need additionally add flow-redirect rule to queue 1
>>>>
>>>> Seems awfully hardware dependent.  Is this just for Mellanox or does
>>>> it have general usefulness?
>>>>
>>> It is just Mellanox's design which requires pre-configure the flow-director.
>>> I only have cards from Intel and Mellanox so I don't know about other vendors.
>>>
>>> Thanks,
>>> William
>>>
>>
>> I think we need to abstract the HW layer a little bit.  This start-qid
>> option is specific to a single piece of HW, at least at this point.
>> We should expect that further HW  specific requirements for
>> different NIC vendors will come up in the future.  I suggest
>> adding a hw_options:mellanox:start-qid type hierarchy  so that
>> as new HW requirements come up we can easily scale.  It will
>> also make adding new vendors easier in the future.
>>
>> Even with NIC vendors you can't always count on each new generation
>> design to always keep old requirements and methods for feature
>> enablement.
>>
>> What do you think?
>>
> Thanks for the feedback.
> So far I don't know whether other vendors will need this option or not.
> I think adding another "hw_options" is a little confusing because this
> is already an option on the device.
> Looking at AF_XDP driver at DPDK, it also has similar option:
> see start_queue
> https://doc.dpdk.org/guides/nics/af_xdp.html
> 

OK, makes sense in this context then.

Thanks,

- Greg
William Tu Feb. 8, 2021, 10:35 p.m. UTC | #8
On Mon, Feb 8, 2021 at 4:58 AM Ilya Maximets <i.maximets@ovn.org> wrote:
>
> On 2/7/21 5:05 PM, Toshiaki Makita wrote:
> > On 2021/02/07 2:00, William Tu wrote:
> >> On Fri, Feb 5, 2021 at 1:08 PM Gregory Rose <gvrose8192@gmail.com> wrote:
> >>> On 2/4/2021 7:08 PM, William Tu wrote:
> >>>> On Thu, Feb 4, 2021 at 3:17 PM Gregory Rose <gvrose8192@gmail.com> wrote:
> >>>>> On 2/3/2021 1:21 PM, William Tu wrote:
> >>>>>> Mellanox card has different XSK design. It requires users to create
> >>>>>> dedicated queues for XSK. Unlike Intel's NIC which loads XDP program
> >>>>>> to all queues, Mellanox only loads XDP program to a subset of its queue.
> >>>>>>
> >>>>>> When OVS uses AF_XDP with mlx5, it doesn't replace the existing RX and TX
> >>>>>> queues in the channel with XSK RX and XSK TX queues, but it creates an
> >>>>>> additional pair of queues for XSK in that channel. To distinguish
> >>>>>> regular and XSK queues, mlx5 uses a different range of qids.
> >>>>>> That means, if the card has 24 queues, queues 0..11 correspond to
> >>>>>> regular queues, and queues 12..23 are XSK queues.
> >>>>>> In this case, we should attach the netdev-afxdp with 'start-qid=12'.
> >>>>>>
> >>>>>> I tested using Mellanox Connect-X 6Dx, by setting 'start-qid=1', and:
> >>>>>>      $ ethtool -L enp2s0f0np0 combined 1
> >>>>>>      # queue 0 is for non-XDP traffic, queue 1 is for XSK
> >>>>>>      $ ethtool -N enp2s0f0np0 flow-type udp4 action 1
> >>>>>> note: we need additionally add flow-redirect rule to queue 1
> >>>>>
> >>>>> Seems awfully hardware dependent.  Is this just for Mellanox or does
> >>>>> it have general usefulness?
> >>>>>
> >>>> It is just Mellanox's design which requires pre-configure the flow-director.
> >>>> I only have cards from Intel and Mellanox so I don't know about other vendors.
> >>>>
> >>>> Thanks,
> >>>> William
> >>>>
> >>>
> >>> I think we need to abstract the HW layer a little bit.  This start-qid
> >>> option is specific to a single piece of HW, at least at this point.
> >>> We should expect that further HW  specific requirements for
> >>> different NIC vendors will come up in the future.  I suggest
> >>> adding a hw_options:mellanox:start-qid type hierarchy  so that
> >>> as new HW requirements come up we can easily scale.  It will
> >>> also make adding new vendors easier in the future.
> >>>
> >>> Even with NIC vendors you can't always count on each new generation
> >>> design to always keep old requirements and methods for feature
> >>> enablement.
> >>>
> >>> What do you think?
> >>>
> >> Thanks for the feedback.
> >> So far I don't know whether other vendors will need this option or not.
> >
> > FWIU, this api "The lower half of the available amount of RX queues are regular queues, and the upper half are XSK RX queues." is the result of long discussion to support dedicated/isolated XSK rings, which is not meant for a mellanox-specific feature.
> >
> > https://patchwork.ozlabs.org/project/netdev/cover/20190524093431.20887-1-maximmi@mellanox.com/
> > https://patchwork.ozlabs.org/project/netdev/cover/20190612155605.22450-1-maximmi@mellanox.com/
> >
> > Toshiaki Makita
>
> Thanks for the links.  Very helpful.
>
> From what I understand lower half of queues should still work, i.e.
> it should still be possible to attach AF_XDP socket to them.  But
> they will not work in zero-copy mode ("generic" only?).
> William, could you check that?  Does it work and with which mode
> "best-effort" ends up with?  And what kind of errors libbpf returns
> if we're trying to enable zero-copy?

Thanks for your feedback.
Yes, only zero-copy mode needs to be aware of this, meaning zero-copy
mode has to use the upper half of the queues (hence the start-qid option here).
Native mode and SKB mode work OK on both the upper and lower queues.
When attaching a zero-copy XSK to a lower-half queue, libbpf returns EINVAL
at xsk_socket__create().
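For anyone who wants to reproduce this outside OVS, a rough sketch using the
kernel's samples/bpf xdpsock tool (assuming the single-combined-queue layout
from the commit message and that the sample is built locally):
   $ ./xdpsock -i enp2s0f0np0 -q 0 -z -r   # lower-half queue: zero-copy bind fails with EINVAL
   $ ./xdpsock -i enp2s0f0np0 -q 1 -z -r   # upper-half queue: zero-copy bind succeeds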

William
Ilya Maximets Feb. 9, 2021, 10:22 a.m. UTC | #9
On 2/8/21 11:35 PM, William Tu wrote:
> On Mon, Feb 8, 2021 at 4:58 AM Ilya Maximets <i.maximets@ovn.org> wrote:
>>
>> On 2/7/21 5:05 PM, Toshiaki Makita wrote:
>>> On 2021/02/07 2:00, William Tu wrote:
>>>> On Fri, Feb 5, 2021 at 1:08 PM Gregory Rose <gvrose8192@gmail.com> wrote:
>>>>> On 2/4/2021 7:08 PM, William Tu wrote:
>>>>>> On Thu, Feb 4, 2021 at 3:17 PM Gregory Rose <gvrose8192@gmail.com> wrote:
>>>>>>> On 2/3/2021 1:21 PM, William Tu wrote:
>>>>>>>> Mellanox card has different XSK design. It requires users to create
>>>>>>>> dedicated queues for XSK. Unlike Intel's NIC which loads XDP program
>>>>>>>> to all queues, Mellanox only loads XDP program to a subset of its queue.
>>>>>>>>
>>>>>>>> When OVS uses AF_XDP with mlx5, it doesn't replace the existing RX and TX
>>>>>>>> queues in the channel with XSK RX and XSK TX queues, but it creates an
>>>>>>>> additional pair of queues for XSK in that channel. To distinguish
>>>>>>>> regular and XSK queues, mlx5 uses a different range of qids.
>>>>>>>> That means, if the card has 24 queues, queues 0..11 correspond to
>>>>>>>> regular queues, and queues 12..23 are XSK queues.
>>>>>>>> In this case, we should attach the netdev-afxdp with 'start-qid=12'.
>>>>>>>>
>>>>>>>> I tested using Mellanox Connect-X 6Dx, by setting 'start-qid=1', and:
>>>>>>>>      $ ethtool -L enp2s0f0np0 combined 1
>>>>>>>>      # queue 0 is for non-XDP traffic, queue 1 is for XSK
>>>>>>>>      $ ethtool -N enp2s0f0np0 flow-type udp4 action 1
>>>>>>>> note: we need additionally add flow-redirect rule to queue 1
>>>>>>>
>>>>>>> Seems awfully hardware dependent.  Is this just for Mellanox or does
>>>>>>> it have general usefulness?
>>>>>>>
>>>>>> It is just Mellanox's design which requires pre-configure the flow-director.
>>>>>> I only have cards from Intel and Mellanox so I don't know about other vendors.
>>>>>>
>>>>>> Thanks,
>>>>>> William
>>>>>>
>>>>>
>>>>> I think we need to abstract the HW layer a little bit.  This start-qid
>>>>> option is specific to a single piece of HW, at least at this point.
>>>>> We should expect that further HW  specific requirements for
>>>>> different NIC vendors will come up in the future.  I suggest
>>>>> adding a hw_options:mellanox:start-qid type hierarchy  so that
>>>>> as new HW requirements come up we can easily scale.  It will
>>>>> also make adding new vendors easier in the future.
>>>>>
>>>>> Even with NIC vendors you can't always count on each new generation
>>>>> design to always keep old requirements and methods for feature
>>>>> enablement.
>>>>>
>>>>> What do you think?
>>>>>
>>>> Thanks for the feedback.
>>>> So far I don't know whether other vendors will need this option or not.
>>>
>>> FWIU, this api "The lower half of the available amount of RX queues are regular queues, and the upper half are XSK RX queues." is the result of long discussion to support dedicated/isolated XSK rings, which is not meant for a mellanox-specific feature.
>>>
>>> https://patchwork.ozlabs.org/project/netdev/cover/20190524093431.20887-1-maximmi@mellanox.com/
>>> https://patchwork.ozlabs.org/project/netdev/cover/20190612155605.22450-1-maximmi@mellanox.com/
>>>
>>> Toshiaki Makita
>>
>> Thanks for the links.  Very helpful.
>>
>> From what I understand lower half of queues should still work, i.e.
>> it should still be possible to attach AF_XDP socket to them.  But
>> they will not work in zero-copy mode ("generic" only?).
>> William, could you check that?  Does it work and with which mode
>> "best-effort" ends up with?  And what kind of errors libbpf returns
>> if we're trying to enable zero-copy?
> 
> Thanks for your feedback.
> Yes, only zero-copy mode needs to be aware of this, meaning zero-copy
> mode has to use the upper half of the queues (the start-qid option here).
> Native mode and SKB mode works OK on upper and lower queues.
> When attaching zc XSK to lower half queue, libbpf returns EINVAL at
> xsk_socket__create().

OK.  Thanks for checking.

Patch

diff --git a/Documentation/intro/install/afxdp.rst b/Documentation/intro/install/afxdp.rst
index f2643e0d41a1..eac298c52575 100644
--- a/Documentation/intro/install/afxdp.rst
+++ b/Documentation/intro/install/afxdp.rst
@@ -204,6 +204,8 @@  more details):
  * ``use-need-wakeup``: default ``true`` if libbpf supports it,
    otherwise ``false``.
 
+* ``start-qid``: default ``0``.
+
 For example, to use 1 PMD (on core 4) on 1 queue (queue 0) device,
 configure these options: ``pmd-cpu-mask``, ``pmd-rxq-affinity``, and
 ``n_rxq``::
diff --git a/lib/netdev-afxdp.c b/lib/netdev-afxdp.c
index 482400d8d135..36f6a323b1bc 100644
--- a/lib/netdev-afxdp.c
+++ b/lib/netdev-afxdp.c
@@ -458,12 +458,13 @@  xsk_configure_queue(struct netdev_linux *dev, int ifindex, int queue_id,
     VLOG_DBG("%s: configuring queue: %d, mode: %s, use-need-wakeup: %s.",
              netdev_get_name(&dev->up), queue_id, xdp_modes[mode].name,
              dev->use_need_wakeup ? "true" : "false");
-    xsk_info = xsk_configure(ifindex, queue_id, mode, dev->use_need_wakeup,
-                             report_socket_failures);
+    xsk_info = xsk_configure(ifindex, dev->startqid + queue_id, mode,
+                             dev->use_need_wakeup, report_socket_failures);
     if (!xsk_info) {
         VLOG(report_socket_failures ? VLL_ERR : VLL_DBG,
-             "%s: Failed to create AF_XDP socket on queue %d in %s mode.",
-             netdev_get_name(&dev->up), queue_id, xdp_modes[mode].name);
+             "%s: Failed to create AF_XDP socket on queue %d+%d in %s mode.",
+             netdev_get_name(&dev->up), dev->startqid, queue_id,
+             xdp_modes[mode].name);
         dev->xsks[queue_id] = NULL;
         return -1;
     }
@@ -604,6 +605,7 @@  netdev_afxdp_set_config(struct netdev *netdev, const struct smap *args,
     enum afxdp_mode xdp_mode;
     bool need_wakeup;
     int new_n_rxq;
+    int new_startqid;
 
     ovs_mutex_lock(&dev->mutex);
     new_n_rxq = MAX(smap_get_int(args, "n_rxq", NR_QUEUE), 1);
@@ -637,12 +639,18 @@  netdev_afxdp_set_config(struct netdev *netdev, const struct smap *args,
     }
 #endif
 
+    /* TODO: need to check
+     * new_startqid + new_n_rxq > total dev's queues. */
+    new_startqid = smap_get_int(args, "start-qid", 0);
+
     if (dev->requested_n_rxq != new_n_rxq
         || dev->requested_xdp_mode != xdp_mode
-        || dev->requested_need_wakeup != need_wakeup) {
+        || dev->requested_need_wakeup != need_wakeup
+        || dev->requested_startqid != new_startqid) {
         dev->requested_n_rxq = new_n_rxq;
         dev->requested_xdp_mode = xdp_mode;
         dev->requested_need_wakeup = need_wakeup;
+        dev->requested_startqid = new_startqid;
         netdev_request_reconfigure(netdev);
     }
     ovs_mutex_unlock(&dev->mutex);
@@ -661,6 +669,7 @@  netdev_afxdp_get_config(const struct netdev *netdev, struct smap *args)
                     xdp_modes[dev->xdp_mode_in_use].name);
     smap_add_format(args, "use-need-wakeup", "%s",
                     dev->use_need_wakeup ? "true" : "false");
+    smap_add_format(args, "start-qid", "%d", dev->startqid);
     ovs_mutex_unlock(&dev->mutex);
     return 0;
 }
@@ -696,6 +705,7 @@  netdev_afxdp_reconfigure(struct netdev *netdev)
     if (netdev->n_rxq == dev->requested_n_rxq
         && dev->xdp_mode == dev->requested_xdp_mode
         && dev->use_need_wakeup == dev->requested_need_wakeup
+        && dev->startqid == dev->requested_startqid
         && dev->xsks) {
         goto out;
     }
@@ -713,6 +723,7 @@  netdev_afxdp_reconfigure(struct netdev *netdev)
         VLOG_ERR("setrlimit(RLIMIT_MEMLOCK) failed: %s", ovs_strerror(errno));
     }
     dev->use_need_wakeup = dev->requested_need_wakeup;
+    dev->startqid = dev->requested_startqid;
 
     err = xsk_configure_all(netdev);
     if (err) {
@@ -1177,12 +1188,14 @@  netdev_afxdp_construct(struct netdev *netdev)
     /* Queues should not be used before the first reconfiguration. Clearing. */
     netdev->n_rxq = 0;
     netdev->n_txq = 0;
+    dev->startqid = 0;
     dev->xdp_mode = OVS_AF_XDP_MODE_UNSPEC;
     dev->xdp_mode_in_use = OVS_AF_XDP_MODE_UNSPEC;
 
     dev->requested_n_rxq = NR_QUEUE;
     dev->requested_xdp_mode = OVS_AF_XDP_MODE_BEST_EFFORT;
     dev->requested_need_wakeup = NEED_WAKEUP_DEFAULT;
+    dev->requested_startqid = 0;
 
     dev->xsks = NULL;
     dev->tx_locks = NULL;
diff --git a/lib/netdev-linux-private.h b/lib/netdev-linux-private.h
index c7c515f70700..242ce1659614 100644
--- a/lib/netdev-linux-private.h
+++ b/lib/netdev-linux-private.h
@@ -109,6 +109,7 @@  struct netdev_linux {
     /* AF_XDP information. */
     struct xsk_socket_info **xsks;
     int requested_n_rxq;
+    int startqid, requested_startqid;
 
     enum afxdp_mode xdp_mode;               /* Configured AF_XDP mode. */
     enum afxdp_mode requested_xdp_mode;     /* Requested  AF_XDP mode. */
diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml
index a2ad84edefa9..47e14ba67c3a 100644
--- a/vswitchd/vswitch.xml
+++ b/vswitchd/vswitch.xml
@@ -3290,6 +3290,14 @@  ovs-vsctl add-port br0 p0 -- set Interface p0 type=patch options:peer=p1 \
         </p>
       </column>
 
+      <column name="options" key="start-qid"
+              type='{"type": "integer", "minInteger": 0, "maxInteger": 32}'>
+        <p>
+          Specifies the starting XDP socket's queue id.
+          Defaults to 0.
+        </p>
+      </column>
+
       <column name="options" key="vhost-server-path"
               type='{"type": "string"}'>
         <p>