diff mbox series

[ovs-dev,3/3] netdev-offload-dpdk: use flow transfer proxy mechanism

Message ID 20220530141550.368926-4-ivan.malov@oktetlabs.ru
State Superseded
Headers show
Series Rework the usage of DPDK transfer flow offloads | expand

Checks

Context Check Description
ovsrobot/apply-robot warning apply and check: warning
ovsrobot/github-robot-_Build_and_Test fail github build: failed
ovsrobot/intel-ovs-compilation success test: success

Commit Message

Ivan Malov May 30, 2022, 2:15 p.m. UTC
Manage "transfer" flows via the corresponding mechanism.
Doing so requires that the traffic source be specified
explicitly, via the corresponding pattern item.

Signed-off-by: Ivan Malov <ivan.malov@oktetlabs.ru>
Acked-by: Andrew Rybchenko <andrew.rybchenko@oktetlabs.ru>
---
 lib/netdev-dpdk.c         | 73 ++++++++++++++++++++++++++++++++-------
 lib/netdev-dpdk.h         |  2 +-
 lib/netdev-offload-dpdk.c | 43 ++++++++++++++++++++++-
 3 files changed, 103 insertions(+), 15 deletions(-)

Comments

0-day Robot May 30, 2022, 2:42 p.m. UTC | #1
Bleep bloop.  Greetings Ivan Malov, I am a robot and I have tried out your patch.
Thanks for your contribution.

I encountered some error that I wasn't expecting.  See the details below.


checkpatch:
WARNING: Line is 80 characters long (recommended limit is 79)
#41 FILE: lib/netdev-dpdk.c:1126:
                                       &dev->flow_transfer_proxy_port_id, NULL);

ERROR: Inappropriate bracing around statement
#42 FILE: lib/netdev-dpdk.c:1127:
    if (ret == 0)

WARNING: Line is 80 characters long (recommended limit is 79)
#161 FILE: lib/netdev-dpdk.c:5399:
    ret = rte_flow_tunnel_action_decap_release(dev->flow_transfer_proxy_port_id,

Lines checked: 294, Warnings: 2, Errors: 1


Please check this out.  If you feel there has been an error, please email aconole@redhat.com

Thanks,
0-day Robot
Eli Britstein June 1, 2022, 9:17 a.m. UTC | #2
- Missing proper handling of the testpmd syntax logging. It changes the used port according to "transfer", but the log still uses netdev_dpdk_get_port_id().
- The usage of the "proxy" port for rte-flow implies that this proxy port is attached to OVS, otherwise it is not "started" and creation of flows will fail.

>-----Original Message-----
>From: Ivan Malov <ivan.malov@oktetlabs.ru>
>Sent: Monday, May 30, 2022 5:16 PM
>To: dev@openvswitch.org
>Cc: Andrew Rybchenko <andrew.rybchenko@oktetlabs.ru>; Ilya Maximets
><i.maximets@ovn.org>; Ori Kam <orika@nvidia.com>; Eli Britstein
><elibr@nvidia.com>; NBU-Contact-Thomas Monjalon (EXTERNAL)
><thomas@monjalon.net>; Stephen Hemminger
><stephen@networkplumber.org>; David Marchand
><david.marchand@redhat.com>; Gaetan Rivet <grive@u256.net>; Maxime
>Coquelin <maxime.coquelin@redhat.com>
>Subject: [PATCH 3/3] netdev-offload-dpdk: use flow transfer proxy
>mechanism
>
>External email: Use caution opening links or attachments
>
>
>Manage "transfer" flows via the corresponding mechanism.
>Doing so requires that the traffic source be specified explicitly, via the
>corresponding pattern item.
>
>Signed-off-by: Ivan Malov <ivan.malov@oktetlabs.ru>
>Acked-by: Andrew Rybchenko <andrew.rybchenko@oktetlabs.ru>
>---
> lib/netdev-dpdk.c         | 73 ++++++++++++++++++++++++++++++++-------
> lib/netdev-dpdk.h         |  2 +-
> lib/netdev-offload-dpdk.c | 43 ++++++++++++++++++++++-
> 3 files changed, 103 insertions(+), 15 deletions(-)
>
>diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index
>45e5d26d2..d0bf4613a 100644
>--- a/lib/netdev-dpdk.c
>+++ b/lib/netdev-dpdk.c
>@@ -420,6 +420,7 @@ enum dpdk_hw_ol_features {
>
> struct netdev_dpdk {
>     PADDED_MEMBERS_CACHELINE_MARKER(CACHE_LINE_SIZE, cacheline0,
>+        dpdk_port_t flow_transfer_proxy_port_id;
>         dpdk_port_t port_id;
>
>         /* If true, device was attached by rte_eth_dev_attach(). */ @@ -1115,6
>+1116,23 @@ dpdk_eth_dev_init_rx_metadata(struct netdev_dpdk *dev)
>                   DPDK_PORT_ID_FMT, dev->port_id);
>     }
> }
>+
>+static void
>+dpdk_eth_dev_init_flow_transfer_proxy(struct netdev_dpdk *dev) {
>+    int ret;
>+
>+    ret = rte_flow_pick_transfer_proxy(dev->port_id,
>+                                       &dev->flow_transfer_proxy_port_id, NULL);
>+    if (ret == 0)
>+        return;
>+
>+    /*
>+     * The PMD does not indicate the proxy port.
>+     * It is OK to assume the proxy is unneeded.
>+     */
>+    dev->flow_transfer_proxy_port_id = dev->port_id; }
> #endif /* ALLOW_EXPERIMENTAL_API */
>
> static int
>@@ -1141,6 +1159,19 @@ dpdk_eth_dev_init(struct netdev_dpdk *dev)
>      * Request delivery of such metadata.
>      */
>     dpdk_eth_dev_init_rx_metadata(dev);
>+
>+    /*
>+     * Managing "transfer" flows requires that the user communicate them
>+     * via a port which has the privilege to control the embedded switch.
>+     * For some vendors, all ports in a given switching domain have
>+     * this privilege. For other vendors, it's only one port.
>+     *
>+     * Get the proxy port ID and remember it for later use.
>+     */
>+    dpdk_eth_dev_init_flow_transfer_proxy(dev);
>+#else /* ! ALLOW_EXPERIMENTAL_API */
>+    /* It is OK to assume the proxy is unneeded. */
>+    dev->flow_transfer_proxy_port_id = dev->port_id;
> #endif /* ALLOW_EXPERIMENTAL_API */
>
>     rte_eth_dev_info_get(dev->port_id, &info);
>@@ -5214,13 +5245,15 @@ out:
>
> int
> netdev_dpdk_rte_flow_destroy(struct netdev *netdev,
>-                             struct rte_flow *rte_flow,
>+                             bool transfer, struct rte_flow *rte_flow,
>                              struct rte_flow_error *error)
> {
>     struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
>     int ret;
>
>-    ret = rte_flow_destroy(dev->port_id, rte_flow, error);
>+    ret = rte_flow_destroy(transfer ?
>+                           dev->flow_transfer_proxy_port_id : dev->port_id,
>+                           rte_flow, error);
>     return ret;
> }
>
>@@ -5234,7 +5267,19 @@ netdev_dpdk_rte_flow_create(struct netdev
>*netdev,
>     struct rte_flow *flow;
>     struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
>
>-    flow = rte_flow_create(dev->port_id, attr, items, actions, error);
>+#ifdef ALLOW_EXPERIMENTAL_API
>+    if (!attr->transfer) {
>+        /*
>+         * The 1st item in any pattern is a traffic source one.
>+         * It is unnecessary in the case of non-transfer rules.
>+         */
>+        ++(items);
>+    }
>+#endif /* ALLOW_EXPERIMENTAL_API */
>+
>+    flow = rte_flow_create(attr->transfer ?
>+                           dev->flow_transfer_proxy_port_id : dev->port_id,
>+                           attr, items, actions, error);
>     return flow;
> }
>
>@@ -5262,7 +5307,8 @@ netdev_dpdk_rte_flow_query_count(struct netdev
>*netdev,
>     }
>
>     dev = netdev_dpdk_cast(netdev);
>-    ret = rte_flow_query(dev->port_id, rte_flow, actions, query, error);
>+    ret = rte_flow_query(dev->flow_transfer_proxy_port_id, rte_flow,
>+                         actions, query, error);
>     return ret;
> }
>
>@@ -5284,8 +5330,8 @@ netdev_dpdk_rte_flow_tunnel_decap_set(struct
>netdev *netdev,
>
>     dev = netdev_dpdk_cast(netdev);
>     ovs_mutex_lock(&dev->mutex);
>-    ret = rte_flow_tunnel_decap_set(dev->port_id, tunnel, actions,
>-                                    num_of_actions, error);
>+    ret = rte_flow_tunnel_decap_set(dev->flow_transfer_proxy_port_id,
>tunnel,
>+                                    actions, num_of_actions, error);
>     ovs_mutex_unlock(&dev->mutex);
>     return ret;
> }
>@@ -5306,8 +5352,8 @@ netdev_dpdk_rte_flow_tunnel_match(struct
>netdev *netdev,
>
>     dev = netdev_dpdk_cast(netdev);
>     ovs_mutex_lock(&dev->mutex);
>-    ret = rte_flow_tunnel_match(dev->port_id, tunnel, items, num_of_items,
>-                                error);
>+    ret = rte_flow_tunnel_match(dev->flow_transfer_proxy_port_id, tunnel,
>+                                items, num_of_items, error);
>     ovs_mutex_unlock(&dev->mutex);
>     return ret;
> }
>@@ -5328,7 +5374,8 @@ netdev_dpdk_rte_flow_get_restore_info(struct
>netdev *netdev,
>
>     dev = netdev_dpdk_cast(netdev);
>     ovs_mutex_lock(&dev->mutex);
>-    ret = rte_flow_get_restore_info(dev->port_id, m, info, error);
>+    ret = rte_flow_get_restore_info(dev->flow_transfer_proxy_port_id,
>+                                    m, info, error);
>     ovs_mutex_unlock(&dev->mutex);
>     return ret;
> }
>@@ -5349,8 +5396,8 @@
>netdev_dpdk_rte_flow_tunnel_action_decap_release(
>
>     dev = netdev_dpdk_cast(netdev);
>     ovs_mutex_lock(&dev->mutex);
>-    ret = rte_flow_tunnel_action_decap_release(dev->port_id, actions,
>-                                               num_of_actions, error);
>+    ret = rte_flow_tunnel_action_decap_release(dev-
>>flow_transfer_proxy_port_id,
>+                                               actions, num_of_actions, error);
>     ovs_mutex_unlock(&dev->mutex);
>     return ret;
> }
>@@ -5370,8 +5417,8 @@ netdev_dpdk_rte_flow_tunnel_item_release(struct
>netdev *netdev,
>
>     dev = netdev_dpdk_cast(netdev);
>     ovs_mutex_lock(&dev->mutex);
>-    ret = rte_flow_tunnel_item_release(dev->port_id, items, num_of_items,
>-                                       error);
>+    ret = rte_flow_tunnel_item_release(dev->flow_transfer_proxy_port_id,
>+                                       items, num_of_items, error);
>     ovs_mutex_unlock(&dev->mutex);
>     return ret;
> }
>diff --git a/lib/netdev-dpdk.h b/lib/netdev-dpdk.h
>index 699be3fb4..1dd5953a4 100644
>--- a/lib/netdev-dpdk.h
>+++ b/lib/netdev-dpdk.h
>@@ -35,7 +35,7 @@ bool netdev_dpdk_flow_api_supported(struct netdev *);
>
> int
> netdev_dpdk_rte_flow_destroy(struct netdev *netdev,
>-                             struct rte_flow *rte_flow,
>+                             bool transfer, struct rte_flow *rte_flow,
>                              struct rte_flow_error *error);
> struct rte_flow *
> netdev_dpdk_rte_flow_create(struct netdev *netdev,
>diff --git a/lib/netdev-offload-dpdk.c b/lib/netdev-offload-dpdk.c
>index 9cd5a0159..f8b90cbf7 100644
>--- a/lib/netdev-offload-dpdk.c
>+++ b/lib/netdev-offload-dpdk.c
>@@ -353,8 +353,23 @@ dump_flow_pattern(struct ds *s,
>
>     if (item->type == RTE_FLOW_ITEM_TYPE_END) {
>         ds_put_cstr(s, "end ");
>+#ifdef ALLOW_EXPERIMENTAL_API
>+    } else if (item->type == RTE_FLOW_ITEM_TYPE_REPRESENTED_PORT) {
>+        const struct rte_flow_item_ethdev *ethdev_spec = item->spec;
>+        const struct rte_flow_item_ethdev *ethdev_mask = item->mask;
>+
>+        ds_put_cstr(s, "represented_port ");
>+
>+        DUMP_PATTERN_ITEM(ethdev_mask->port_id, false, "ethdev_port_id",
>+                          "%"PRIu16, ethdev_spec->port_id,
>+                          ethdev_mask->port_id, 0);
>+    } else if (flow_patterns->tnl_pmd_items_cnt &&
>+               pattern_index < 1 /* REPRESENTED_PORT */ +
>+                               flow_patterns->tnl_pmd_items_cnt) {
>+#else /* ! ALLOW_EXPERIMENTAL_API */
>     } else if (flow_patterns->tnl_pmd_items_cnt &&
>                pattern_index < flow_patterns->tnl_pmd_items_cnt) {
>+#endif /* ALLOW_EXPERIMENTAL_API */
>         return;
>     } else if (item->type == RTE_FLOW_ITEM_TYPE_ETH) {
>         const struct rte_flow_item_eth *eth_spec = item->spec;
>@@ -1035,6 +1050,12 @@ free_flow_patterns(struct flow_patterns
>*patterns)
>     struct rte_flow_error error;
>     int i;
>
>+#ifdef ALLOW_EXPERIMENTAL_API
>+    /* REPRESENTED_PORT */
>+    free(CONST_CAST(void *, patterns->items[0].spec));
>+    free(CONST_CAST(void *, patterns->items[0].mask));
>+#endif /* ALLOW_EXPERIMENTAL_API */
>+
>     if (patterns->tnl_pmd_items) {
>         struct rte_flow_item *tnl_pmd_items = patterns->tnl_pmd_items;
>         uint32_t tnl_pmd_items_cnt = patterns->tnl_pmd_items_cnt;
>@@ -1049,7 +1070,12 @@ free_flow_patterns(struct flow_patterns
>*patterns)
>         }
>     }
>
>+#ifdef ALLOW_EXPERIMENTAL_API
>+    for (i = 1 /* REPRESENTED_PORT */ + patterns->tnl_pmd_items_cnt;
>+         i < patterns->cnt; i++) {
>+#else /* ! ALLOW_EXPERIMENTAL_API */
>     for (i = patterns->tnl_pmd_items_cnt; i < patterns->cnt; i++) {
>+#endif /* ALLOW_EXPERIMENTAL_API */
>         if (patterns->items[i].spec) {
>             free(CONST_CAST(void *, patterns->items[i].spec));
>         }
>@@ -1383,10 +1409,23 @@ parse_flow_match(struct netdev *netdev,
>                  struct flow_patterns *patterns,
>                  struct match *match)
> {
>+#ifdef ALLOW_EXPERIMENTAL_API
>+    struct netdev *physdev = netdev_ports_get(orig_in_port, netdev-
>>dpif_type);
>+    struct rte_flow_item_ethdev *ethdev_spec = xzalloc(sizeof *ethdev_spec);
>+    struct rte_flow_item_ethdev *ethdev_mask = xzalloc(sizeof
>*ethdev_mask);
>+#endif /* ALLOW_EXPERIMENTAL_API */
>     struct rte_flow_item_eth *eth_spec = NULL, *eth_mask = NULL;
>     struct flow *consumed_masks;
>     uint8_t proto = 0;
>
>+#ifdef ALLOW_EXPERIMENTAL_API
>+    /* Add an explicit traffic source item to the beginning of the pattern. */
>+    ethdev_spec->port_id = netdev_dpdk_get_port_id(physdev);
>+    *ethdev_mask = rte_flow_item_ethdev_mask;
>+    add_flow_pattern(patterns,
>RTE_FLOW_ITEM_TYPE_REPRESENTED_PORT,
>+                     ethdev_spec, ethdev_mask, NULL);
>+#endif /* ALLOW_EXPERIMENTAL_API */
>+
>     consumed_masks = &match->wc.masks;
>
>     if (!flow_tnl_dst_is_set(&match->flow.tunnel)) {
>@@ -2333,6 +2372,7 @@ netdev_offload_dpdk_flow_destroy(struct
>ufid_to_rte_flow_data *rte_flow_data)
>     struct netdev *physdev;
>     struct netdev *netdev;
>     ovs_u128 *ufid;
>+    bool transfer;
>     int ret;
>
>     ovs_mutex_lock(&rte_flow_data->lock);
>@@ -2344,12 +2384,13 @@ netdev_offload_dpdk_flow_destroy(struct
>ufid_to_rte_flow_data *rte_flow_data)
>
>     rte_flow_data->dead = true;
>
>+    transfer = rte_flow_data->actions_offloaded;
>     rte_flow = rte_flow_data->rte_flow;
>     physdev = rte_flow_data->physdev;
>     netdev = rte_flow_data->netdev;
>     ufid = &rte_flow_data->ufid;
>
>-    ret = netdev_dpdk_rte_flow_destroy(physdev, rte_flow, &error);
>+    ret = netdev_dpdk_rte_flow_destroy(physdev, transfer, rte_flow, &error);
>
>     if (ret == 0) {
>         struct netdev_offload_dpdk_data *data;
>--
>2.30.2
Ivan Malov June 7, 2022, 8:55 p.m. UTC | #3
Hi Eli,

On Wed, 1 Jun 2022, Eli Britstein wrote:

> - Missing proper handling of the testpmd syntax logging. It changes the used port according to "transfer", but the log still uses netdev_dpdk_get_port_id().

Thanks for noticing. I will see to it in the next version.

> - The usage of the "proxy" port for rte-flow implies that this proxy port is attached to OVS, otherwise it is not "started" and creation of flows will fail.

That's the way it is. If there is no proxy for a given port, then the
original port value will be used for managing flows. For vendors that
don't need the proxy, this will work. For others, it won't. That's OK.

>
>> -----Original Message-----
>> From: Ivan Malov <ivan.malov@oktetlabs.ru>
>> Sent: Monday, May 30, 2022 5:16 PM
>> To: dev@openvswitch.org
>> Cc: Andrew Rybchenko <andrew.rybchenko@oktetlabs.ru>; Ilya Maximets
>> <i.maximets@ovn.org>; Ori Kam <orika@nvidia.com>; Eli Britstein
>> <elibr@nvidia.com>; NBU-Contact-Thomas Monjalon (EXTERNAL)
>> <thomas@monjalon.net>; Stephen Hemminger
>> <stephen@networkplumber.org>; David Marchand
>> <david.marchand@redhat.com>; Gaetan Rivet <grive@u256.net>; Maxime
>> Coquelin <maxime.coquelin@redhat.com>
>> Subject: [PATCH 3/3] netdev-offload-dpdk: use flow transfer proxy
>> mechanism
>>
>> External email: Use caution opening links or attachments
>>
>>
>> Manage "transfer" flows via the corresponding mechanism.
>> Doing so requires that the traffic source be specified explicitly, via the
>> corresponding pattern item.
>>
>> Signed-off-by: Ivan Malov <ivan.malov@oktetlabs.ru>
>> Acked-by: Andrew Rybchenko <andrew.rybchenko@oktetlabs.ru>
>> ---
>> lib/netdev-dpdk.c         | 73 ++++++++++++++++++++++++++++++++-------
>> lib/netdev-dpdk.h         |  2 +-
>> lib/netdev-offload-dpdk.c | 43 ++++++++++++++++++++++-
>> 3 files changed, 103 insertions(+), 15 deletions(-)
>>
>> diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index
>> 45e5d26d2..d0bf4613a 100644
>> --- a/lib/netdev-dpdk.c
>> +++ b/lib/netdev-dpdk.c
>> @@ -420,6 +420,7 @@ enum dpdk_hw_ol_features {
>>
>> struct netdev_dpdk {
>>     PADDED_MEMBERS_CACHELINE_MARKER(CACHE_LINE_SIZE, cacheline0,
>> +        dpdk_port_t flow_transfer_proxy_port_id;
>>         dpdk_port_t port_id;
>>
>>         /* If true, device was attached by rte_eth_dev_attach(). */ @@ -1115,6
>> +1116,23 @@ dpdk_eth_dev_init_rx_metadata(struct netdev_dpdk *dev)
>>                   DPDK_PORT_ID_FMT, dev->port_id);
>>     }
>> }
>> +
>> +static void
>> +dpdk_eth_dev_init_flow_transfer_proxy(struct netdev_dpdk *dev) {
>> +    int ret;
>> +
>> +    ret = rte_flow_pick_transfer_proxy(dev->port_id,
>> +                                       &dev->flow_transfer_proxy_port_id, NULL);
>> +    if (ret == 0)
>> +        return;
>> +
>> +    /*
>> +     * The PMD does not indicate the proxy port.
>> +     * It is OK to assume the proxy is unneeded.
>> +     */
>> +    dev->flow_transfer_proxy_port_id = dev->port_id; }
>> #endif /* ALLOW_EXPERIMENTAL_API */
>>
>> static int
>> @@ -1141,6 +1159,19 @@ dpdk_eth_dev_init(struct netdev_dpdk *dev)
>>      * Request delivery of such metadata.
>>      */
>>     dpdk_eth_dev_init_rx_metadata(dev);
>> +
>> +    /*
>> +     * Managing "transfer" flows requires that the user communicate them
>> +     * via a port which has the privilege to control the embedded switch.
>> +     * For some vendors, all ports in a given switching domain have
>> +     * this privilege. For other vendors, it's only one port.
>> +     *
>> +     * Get the proxy port ID and remember it for later use.
>> +     */
>> +    dpdk_eth_dev_init_flow_transfer_proxy(dev);
>> +#else /* ! ALLOW_EXPERIMENTAL_API */
>> +    /* It is OK to assume the proxy is unneeded. */
>> +    dev->flow_transfer_proxy_port_id = dev->port_id;
>> #endif /* ALLOW_EXPERIMENTAL_API */
>>
>>     rte_eth_dev_info_get(dev->port_id, &info);
>> @@ -5214,13 +5245,15 @@ out:
>>
>> int
>> netdev_dpdk_rte_flow_destroy(struct netdev *netdev,
>> -                             struct rte_flow *rte_flow,
>> +                             bool transfer, struct rte_flow *rte_flow,
>>                              struct rte_flow_error *error)
>> {
>>     struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
>>     int ret;
>>
>> -    ret = rte_flow_destroy(dev->port_id, rte_flow, error);
>> +    ret = rte_flow_destroy(transfer ?
>> +                           dev->flow_transfer_proxy_port_id : dev->port_id,
>> +                           rte_flow, error);
>>     return ret;
>> }
>>
>> @@ -5234,7 +5267,19 @@ netdev_dpdk_rte_flow_create(struct netdev
>> *netdev,
>>     struct rte_flow *flow;
>>     struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
>>
>> -    flow = rte_flow_create(dev->port_id, attr, items, actions, error);
>> +#ifdef ALLOW_EXPERIMENTAL_API
>> +    if (!attr->transfer) {
>> +        /*
>> +         * The 1st item in any pattern is a traffic source one.
>> +         * It is unnecessary in the case of non-transfer rules.
>> +         */
>> +        ++(items);
>> +    }
>> +#endif /* ALLOW_EXPERIMENTAL_API */
>> +
>> +    flow = rte_flow_create(attr->transfer ?
>> +                           dev->flow_transfer_proxy_port_id : dev->port_id,
>> +                           attr, items, actions, error);
>>     return flow;
>> }
>>
>> @@ -5262,7 +5307,8 @@ netdev_dpdk_rte_flow_query_count(struct netdev
>> *netdev,
>>     }
>>
>>     dev = netdev_dpdk_cast(netdev);
>> -    ret = rte_flow_query(dev->port_id, rte_flow, actions, query, error);
>> +    ret = rte_flow_query(dev->flow_transfer_proxy_port_id, rte_flow,
>> +                         actions, query, error);
>>     return ret;
>> }
>>
>> @@ -5284,8 +5330,8 @@ netdev_dpdk_rte_flow_tunnel_decap_set(struct
>> netdev *netdev,
>>
>>     dev = netdev_dpdk_cast(netdev);
>>     ovs_mutex_lock(&dev->mutex);
>> -    ret = rte_flow_tunnel_decap_set(dev->port_id, tunnel, actions,
>> -                                    num_of_actions, error);
>> +    ret = rte_flow_tunnel_decap_set(dev->flow_transfer_proxy_port_id,
>> tunnel,
>> +                                    actions, num_of_actions, error);
>>     ovs_mutex_unlock(&dev->mutex);
>>     return ret;
>> }
>> @@ -5306,8 +5352,8 @@ netdev_dpdk_rte_flow_tunnel_match(struct
>> netdev *netdev,
>>
>>     dev = netdev_dpdk_cast(netdev);
>>     ovs_mutex_lock(&dev->mutex);
>> -    ret = rte_flow_tunnel_match(dev->port_id, tunnel, items, num_of_items,
>> -                                error);
>> +    ret = rte_flow_tunnel_match(dev->flow_transfer_proxy_port_id, tunnel,
>> +                                items, num_of_items, error);
>>     ovs_mutex_unlock(&dev->mutex);
>>     return ret;
>> }
>> @@ -5328,7 +5374,8 @@ netdev_dpdk_rte_flow_get_restore_info(struct
>> netdev *netdev,
>>
>>     dev = netdev_dpdk_cast(netdev);
>>     ovs_mutex_lock(&dev->mutex);
>> -    ret = rte_flow_get_restore_info(dev->port_id, m, info, error);
>> +    ret = rte_flow_get_restore_info(dev->flow_transfer_proxy_port_id,
>> +                                    m, info, error);
>>     ovs_mutex_unlock(&dev->mutex);
>>     return ret;
>> }
>> @@ -5349,8 +5396,8 @@
>> netdev_dpdk_rte_flow_tunnel_action_decap_release(
>>
>>     dev = netdev_dpdk_cast(netdev);
>>     ovs_mutex_lock(&dev->mutex);
>> -    ret = rte_flow_tunnel_action_decap_release(dev->port_id, actions,
>> -                                               num_of_actions, error);
>> +    ret = rte_flow_tunnel_action_decap_release(dev-
>>> flow_transfer_proxy_port_id,
>> +                                               actions, num_of_actions, error);
>>     ovs_mutex_unlock(&dev->mutex);
>>     return ret;
>> }
>> @@ -5370,8 +5417,8 @@ netdev_dpdk_rte_flow_tunnel_item_release(struct
>> netdev *netdev,
>>
>>     dev = netdev_dpdk_cast(netdev);
>>     ovs_mutex_lock(&dev->mutex);
>> -    ret = rte_flow_tunnel_item_release(dev->port_id, items, num_of_items,
>> -                                       error);
>> +    ret = rte_flow_tunnel_item_release(dev->flow_transfer_proxy_port_id,
>> +                                       items, num_of_items, error);
>>     ovs_mutex_unlock(&dev->mutex);
>>     return ret;
>> }
>> diff --git a/lib/netdev-dpdk.h b/lib/netdev-dpdk.h
>> index 699be3fb4..1dd5953a4 100644
>> --- a/lib/netdev-dpdk.h
>> +++ b/lib/netdev-dpdk.h
>> @@ -35,7 +35,7 @@ bool netdev_dpdk_flow_api_supported(struct netdev *);
>>
>> int
>> netdev_dpdk_rte_flow_destroy(struct netdev *netdev,
>> -                             struct rte_flow *rte_flow,
>> +                             bool transfer, struct rte_flow *rte_flow,
>>                              struct rte_flow_error *error);
>> struct rte_flow *
>> netdev_dpdk_rte_flow_create(struct netdev *netdev,
>> diff --git a/lib/netdev-offload-dpdk.c b/lib/netdev-offload-dpdk.c
>> index 9cd5a0159..f8b90cbf7 100644
>> --- a/lib/netdev-offload-dpdk.c
>> +++ b/lib/netdev-offload-dpdk.c
>> @@ -353,8 +353,23 @@ dump_flow_pattern(struct ds *s,
>>
>>     if (item->type == RTE_FLOW_ITEM_TYPE_END) {
>>         ds_put_cstr(s, "end ");
>> +#ifdef ALLOW_EXPERIMENTAL_API
>> +    } else if (item->type == RTE_FLOW_ITEM_TYPE_REPRESENTED_PORT) {
>> +        const struct rte_flow_item_ethdev *ethdev_spec = item->spec;
>> +        const struct rte_flow_item_ethdev *ethdev_mask = item->mask;
>> +
>> +        ds_put_cstr(s, "represented_port ");
>> +
>> +        DUMP_PATTERN_ITEM(ethdev_mask->port_id, false, "ethdev_port_id",
>> +                          "%"PRIu16, ethdev_spec->port_id,
>> +                          ethdev_mask->port_id, 0);
>> +    } else if (flow_patterns->tnl_pmd_items_cnt &&
>> +               pattern_index < 1 /* REPRESENTED_PORT */ +
>> +                               flow_patterns->tnl_pmd_items_cnt) {
>> +#else /* ! ALLOW_EXPERIMENTAL_API */
>>     } else if (flow_patterns->tnl_pmd_items_cnt &&
>>                pattern_index < flow_patterns->tnl_pmd_items_cnt) {
>> +#endif /* ALLOW_EXPERIMENTAL_API */
>>         return;
>>     } else if (item->type == RTE_FLOW_ITEM_TYPE_ETH) {
>>         const struct rte_flow_item_eth *eth_spec = item->spec;
>> @@ -1035,6 +1050,12 @@ free_flow_patterns(struct flow_patterns
>> *patterns)
>>     struct rte_flow_error error;
>>     int i;
>>
>> +#ifdef ALLOW_EXPERIMENTAL_API
>> +    /* REPRESENTED_PORT */
>> +    free(CONST_CAST(void *, patterns->items[0].spec));
>> +    free(CONST_CAST(void *, patterns->items[0].mask));
>> +#endif /* ALLOW_EXPERIMENTAL_API */
>> +
>>     if (patterns->tnl_pmd_items) {
>>         struct rte_flow_item *tnl_pmd_items = patterns->tnl_pmd_items;
>>         uint32_t tnl_pmd_items_cnt = patterns->tnl_pmd_items_cnt;
>> @@ -1049,7 +1070,12 @@ free_flow_patterns(struct flow_patterns
>> *patterns)
>>         }
>>     }
>>
>> +#ifdef ALLOW_EXPERIMENTAL_API
>> +    for (i = 1 /* REPRESENTED_PORT */ + patterns->tnl_pmd_items_cnt;
>> +         i < patterns->cnt; i++) {
>> +#else /* ! ALLOW_EXPERIMENTAL_API */
>>     for (i = patterns->tnl_pmd_items_cnt; i < patterns->cnt; i++) {
>> +#endif /* ALLOW_EXPERIMENTAL_API */
>>         if (patterns->items[i].spec) {
>>             free(CONST_CAST(void *, patterns->items[i].spec));
>>         }
>> @@ -1383,10 +1409,23 @@ parse_flow_match(struct netdev *netdev,
>>                  struct flow_patterns *patterns,
>>                  struct match *match)
>> {
>> +#ifdef ALLOW_EXPERIMENTAL_API
>> +    struct netdev *physdev = netdev_ports_get(orig_in_port, netdev-
>>> dpif_type);
>> +    struct rte_flow_item_ethdev *ethdev_spec = xzalloc(sizeof *ethdev_spec);
>> +    struct rte_flow_item_ethdev *ethdev_mask = xzalloc(sizeof
>> *ethdev_mask);
>> +#endif /* ALLOW_EXPERIMENTAL_API */
>>     struct rte_flow_item_eth *eth_spec = NULL, *eth_mask = NULL;
>>     struct flow *consumed_masks;
>>     uint8_t proto = 0;
>>
>> +#ifdef ALLOW_EXPERIMENTAL_API
>> +    /* Add an explicit traffic source item to the beginning of the pattern. */
>> +    ethdev_spec->port_id = netdev_dpdk_get_port_id(physdev);
>> +    *ethdev_mask = rte_flow_item_ethdev_mask;
>> +    add_flow_pattern(patterns,
>> RTE_FLOW_ITEM_TYPE_REPRESENTED_PORT,
>> +                     ethdev_spec, ethdev_mask, NULL);
>> +#endif /* ALLOW_EXPERIMENTAL_API */
>> +
>>     consumed_masks = &match->wc.masks;
>>
>>     if (!flow_tnl_dst_is_set(&match->flow.tunnel)) {
>> @@ -2333,6 +2372,7 @@ netdev_offload_dpdk_flow_destroy(struct
>> ufid_to_rte_flow_data *rte_flow_data)
>>     struct netdev *physdev;
>>     struct netdev *netdev;
>>     ovs_u128 *ufid;
>> +    bool transfer;
>>     int ret;
>>
>>     ovs_mutex_lock(&rte_flow_data->lock);
>> @@ -2344,12 +2384,13 @@ netdev_offload_dpdk_flow_destroy(struct
>> ufid_to_rte_flow_data *rte_flow_data)
>>
>>     rte_flow_data->dead = true;
>>
>> +    transfer = rte_flow_data->actions_offloaded;
>>     rte_flow = rte_flow_data->rte_flow;
>>     physdev = rte_flow_data->physdev;
>>     netdev = rte_flow_data->netdev;
>>     ufid = &rte_flow_data->ufid;
>>
>> -    ret = netdev_dpdk_rte_flow_destroy(physdev, rte_flow, &error);
>> +    ret = netdev_dpdk_rte_flow_destroy(physdev, transfer, rte_flow, &error);
>>
>>     if (ret == 0) {
>>         struct netdev_offload_dpdk_data *data;
>> --
>> 2.30.2
>
>
Eli Britstein June 8, 2022, 8:13 a.m. UTC | #4
Hi Ivan,

>-----Original Message-----
>From: Ivan Malov <ivan.malov@oktetlabs.ru>
>Sent: Tuesday, June 7, 2022 11:56 PM
>To: Eli Britstein <elibr@nvidia.com>
>Cc: dev@openvswitch.org; Andrew Rybchenko
><andrew.rybchenko@oktetlabs.ru>; Ilya Maximets <i.maximets@ovn.org>;
>Ori Kam <orika@nvidia.com>; NBU-Contact-Thomas Monjalon (EXTERNAL)
><thomas@monjalon.net>; Stephen Hemminger
><stephen@networkplumber.org>; David Marchand
><david.marchand@redhat.com>; Gaetan Rivet <grive@u256.net>; Maxime
>Coquelin <maxime.coquelin@redhat.com>
>Subject: RE: [PATCH 3/3] netdev-offload-dpdk: use flow transfer proxy
>mechanism
>
>External email: Use caution opening links or attachments
>
>
>Hi Eli,
>
>On Wed, 1 Jun 2022, Eli Britstein wrote:
>
>> - Missing proper handling of the testpmd syntax logging. It changes the used
>port according to "transfer", but the log still uses netdev_dpdk_get_port_id().
>
>Thanks for noticing. I will see to it in the next version.
>
>> - The usage of the "proxy" port for rte-flow implies that this proxy port is
>attached to OVS, otherwise it is not "started" and creation of flows will fail.
>
>That's the way it is. If there is no proxy for a given port, then the original port
>value will be used for managing flows. For vendors that don't need the proxy,
>this will work. For others, it won't. That's OK.
I don't really understand why this can't be done inside the DPDK domain (if there is a proxy and it is up, use it; otherwise don't).
That's *currently* the way it is. I understand that if DPDK works like this, OVS should align, but maybe you or someone else here knows why DPDK works like this? (It's not too late to change; this is experimental...)
>
>>
>>> -----Original Message-----
>>> From: Ivan Malov <ivan.malov@oktetlabs.ru>
>>> Sent: Monday, May 30, 2022 5:16 PM
>>> To: dev@openvswitch.org
>>> Cc: Andrew Rybchenko <andrew.rybchenko@oktetlabs.ru>; Ilya Maximets
>>> <i.maximets@ovn.org>; Ori Kam <orika@nvidia.com>; Eli Britstein
>>> <elibr@nvidia.com>; NBU-Contact-Thomas Monjalon (EXTERNAL)
>>> <thomas@monjalon.net>; Stephen Hemminger
>>> <stephen@networkplumber.org>; David Marchand
>>> <david.marchand@redhat.com>; Gaetan Rivet <grive@u256.net>; Maxime
>>> Coquelin <maxime.coquelin@redhat.com>
>>> Subject: [PATCH 3/3] netdev-offload-dpdk: use flow transfer proxy
>>> mechanism
>>>
>>> External email: Use caution opening links or attachments
>>>
>>>
>>> Manage "transfer" flows via the corresponding mechanism.
>>> Doing so requires that the traffic source be specified explicitly,
>>> via the corresponding pattern item.
>>>
>>> Signed-off-by: Ivan Malov <ivan.malov@oktetlabs.ru>
>>> Acked-by: Andrew Rybchenko <andrew.rybchenko@oktetlabs.ru>
>>> ---
>>> lib/netdev-dpdk.c         | 73 ++++++++++++++++++++++++++++++++-------
>>> lib/netdev-dpdk.h         |  2 +-
>>> lib/netdev-offload-dpdk.c | 43 ++++++++++++++++++++++-
>>> 3 files changed, 103 insertions(+), 15 deletions(-)
>>>
>>> diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index
>>> 45e5d26d2..d0bf4613a 100644
>>> --- a/lib/netdev-dpdk.c
>>> +++ b/lib/netdev-dpdk.c
>>> @@ -420,6 +420,7 @@ enum dpdk_hw_ol_features {
>>>
>>> struct netdev_dpdk {
>>>     PADDED_MEMBERS_CACHELINE_MARKER(CACHE_LINE_SIZE,
>cacheline0,
>>> +        dpdk_port_t flow_transfer_proxy_port_id;
>>>         dpdk_port_t port_id;
>>>
>>>         /* If true, device was attached by rte_eth_dev_attach(). */
>>> @@ -1115,6
>>> +1116,23 @@ dpdk_eth_dev_init_rx_metadata(struct netdev_dpdk *dev)
>>>                   DPDK_PORT_ID_FMT, dev->port_id);
>>>     }
>>> }
>>> +
>>> +static void
>>> +dpdk_eth_dev_init_flow_transfer_proxy(struct netdev_dpdk *dev) {
>>> +    int ret;
>>> +
>>> +    ret = rte_flow_pick_transfer_proxy(dev->port_id,
>>> +                                       &dev->flow_transfer_proxy_port_id, NULL);
>>> +    if (ret == 0)
>>> +        return;
>>> +
>>> +    /*
>>> +     * The PMD does not indicate the proxy port.
>>> +     * It is OK to assume the proxy is unneeded.
>>> +     */
>>> +    dev->flow_transfer_proxy_port_id = dev->port_id; }
>>> #endif /* ALLOW_EXPERIMENTAL_API */
>>>
>>> static int
>>> @@ -1141,6 +1159,19 @@ dpdk_eth_dev_init(struct netdev_dpdk *dev)
>>>      * Request delivery of such metadata.
>>>      */
>>>     dpdk_eth_dev_init_rx_metadata(dev);
>>> +
>>> +    /*
>>> +     * Managing "transfer" flows requires that the user communicate them
>>> +     * via a port which has the privilege to control the embedded switch.
>>> +     * For some vendors, all ports in a given switching domain have
>>> +     * this privilege. For other vendors, it's only one port.
>>> +     *
>>> +     * Get the proxy port ID and remember it for later use.
>>> +     */
>>> +    dpdk_eth_dev_init_flow_transfer_proxy(dev);
>>> +#else /* ! ALLOW_EXPERIMENTAL_API */
>>> +    /* It is OK to assume the proxy is unneeded. */
>>> +    dev->flow_transfer_proxy_port_id = dev->port_id;
>>> #endif /* ALLOW_EXPERIMENTAL_API */
>>>
>>>     rte_eth_dev_info_get(dev->port_id, &info); @@ -5214,13 +5245,15
>>> @@ out:
>>>
>>> int
>>> netdev_dpdk_rte_flow_destroy(struct netdev *netdev,
>>> -                             struct rte_flow *rte_flow,
>>> +                             bool transfer, struct rte_flow
>>> + *rte_flow,
>>>                              struct rte_flow_error *error) {
>>>     struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
>>>     int ret;
>>>
>>> -    ret = rte_flow_destroy(dev->port_id, rte_flow, error);
>>> +    ret = rte_flow_destroy(transfer ?
>>> +                           dev->flow_transfer_proxy_port_id : dev->port_id,
>>> +                           rte_flow, error);
>>>     return ret;
>>> }
>>>
>>> @@ -5234,7 +5267,19 @@ netdev_dpdk_rte_flow_create(struct netdev
>>> *netdev,
>>>     struct rte_flow *flow;
>>>     struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
>>>
>>> -    flow = rte_flow_create(dev->port_id, attr, items, actions, error);
>>> +#ifdef ALLOW_EXPERIMENTAL_API
>>> +    if (!attr->transfer) {
>>> +        /*
>>> +         * The 1st item in any pattern is a traffic source one.
>>> +         * It is unnecessary in the case of non-transfer rules.
>>> +         */
>>> +        ++(items);
>>> +    }
>>> +#endif /* ALLOW_EXPERIMENTAL_API */
>>> +
>>> +    flow = rte_flow_create(attr->transfer ?
>>> +                           dev->flow_transfer_proxy_port_id : dev->port_id,
>>> +                           attr, items, actions, error);
>>>     return flow;
>>> }
>>>
>>> @@ -5262,7 +5307,8 @@ netdev_dpdk_rte_flow_query_count(struct
>netdev
>>> *netdev,
>>>     }
>>>
>>>     dev = netdev_dpdk_cast(netdev);
>>> -    ret = rte_flow_query(dev->port_id, rte_flow, actions, query, error);
>>> +    ret = rte_flow_query(dev->flow_transfer_proxy_port_id, rte_flow,
>>> +                         actions, query, error);
>>>     return ret;
>>> }
>>>
>>> @@ -5284,8 +5330,8 @@ netdev_dpdk_rte_flow_tunnel_decap_set(struct
>>> netdev *netdev,
>>>
>>>     dev = netdev_dpdk_cast(netdev);
>>>     ovs_mutex_lock(&dev->mutex);
>>> -    ret = rte_flow_tunnel_decap_set(dev->port_id, tunnel, actions,
>>> -                                    num_of_actions, error);
>>> +    ret =
>>> + rte_flow_tunnel_decap_set(dev->flow_transfer_proxy_port_id,
>>> tunnel,
>>> +                                    actions, num_of_actions, error);
>>>     ovs_mutex_unlock(&dev->mutex);
>>>     return ret;
>>> }
>>> @@ -5306,8 +5352,8 @@ netdev_dpdk_rte_flow_tunnel_match(struct
>>> netdev *netdev,
>>>
>>>     dev = netdev_dpdk_cast(netdev);
>>>     ovs_mutex_lock(&dev->mutex);
>>> -    ret = rte_flow_tunnel_match(dev->port_id, tunnel, items,
>num_of_items,
>>> -                                error);
>>> +    ret = rte_flow_tunnel_match(dev->flow_transfer_proxy_port_id,
>tunnel,
>>> +                                items, num_of_items, error);
>>>     ovs_mutex_unlock(&dev->mutex);
>>>     return ret;
>>> }
>>> @@ -5328,7 +5374,8 @@ netdev_dpdk_rte_flow_get_restore_info(struct
>>> netdev *netdev,
>>>
>>>     dev = netdev_dpdk_cast(netdev);
>>>     ovs_mutex_lock(&dev->mutex);
>>> -    ret = rte_flow_get_restore_info(dev->port_id, m, info, error);
>>> +    ret = rte_flow_get_restore_info(dev->flow_transfer_proxy_port_id,
>>> +                                    m, info, error);
>>>     ovs_mutex_unlock(&dev->mutex);
>>>     return ret;
>>> }
>>> @@ -5349,8 +5396,8 @@
>>> netdev_dpdk_rte_flow_tunnel_action_decap_release(
>>>
>>>     dev = netdev_dpdk_cast(netdev);
>>>     ovs_mutex_lock(&dev->mutex);
>>> -    ret = rte_flow_tunnel_action_decap_release(dev->port_id, actions,
>>> -                                               num_of_actions, error);
>>> +    ret = rte_flow_tunnel_action_decap_release(dev-
>>>> flow_transfer_proxy_port_id,
>>> +                                               actions,
>>> + num_of_actions, error);
>>>     ovs_mutex_unlock(&dev->mutex);
>>>     return ret;
>>> }
>>> @@ -5370,8 +5417,8 @@
>netdev_dpdk_rte_flow_tunnel_item_release(struct
>>> netdev *netdev,
>>>
>>>     dev = netdev_dpdk_cast(netdev);
>>>     ovs_mutex_lock(&dev->mutex);
>>> -    ret = rte_flow_tunnel_item_release(dev->port_id, items,
>num_of_items,
>>> -                                       error);
>>> +    ret = rte_flow_tunnel_item_release(dev->flow_transfer_proxy_port_id,
>>> +                                       items, num_of_items, error);
>>>     ovs_mutex_unlock(&dev->mutex);
>>>     return ret;
>>> }
>>> diff --git a/lib/netdev-dpdk.h b/lib/netdev-dpdk.h index
>>> 699be3fb4..1dd5953a4 100644
>>> --- a/lib/netdev-dpdk.h
>>> +++ b/lib/netdev-dpdk.h
>>> @@ -35,7 +35,7 @@ bool netdev_dpdk_flow_api_supported(struct netdev
>>> *);
>>>
>>> int
>>> netdev_dpdk_rte_flow_destroy(struct netdev *netdev,
>>> -                             struct rte_flow *rte_flow,
>>> +                             bool transfer, struct rte_flow
>>> + *rte_flow,
>>>                              struct rte_flow_error *error); struct
>>> rte_flow * netdev_dpdk_rte_flow_create(struct netdev *netdev, diff
>>> --git a/lib/netdev-offload-dpdk.c b/lib/netdev-offload-dpdk.c index
>>> 9cd5a0159..f8b90cbf7 100644
>>> --- a/lib/netdev-offload-dpdk.c
>>> +++ b/lib/netdev-offload-dpdk.c
>>> @@ -353,8 +353,23 @@ dump_flow_pattern(struct ds *s,
>>>
>>>     if (item->type == RTE_FLOW_ITEM_TYPE_END) {
>>>         ds_put_cstr(s, "end ");
>>> +#ifdef ALLOW_EXPERIMENTAL_API
>>> +    } else if (item->type == RTE_FLOW_ITEM_TYPE_REPRESENTED_PORT) {
>>> +        const struct rte_flow_item_ethdev *ethdev_spec = item->spec;
>>> +        const struct rte_flow_item_ethdev *ethdev_mask = item->mask;
>>> +
>>> +        ds_put_cstr(s, "represented_port ");
>>> +
>>> +        DUMP_PATTERN_ITEM(ethdev_mask->port_id, false,
>"ethdev_port_id",
>>> +                          "%"PRIu16, ethdev_spec->port_id,
>>> +                          ethdev_mask->port_id, 0);
>>> +    } else if (flow_patterns->tnl_pmd_items_cnt &&
>>> +               pattern_index < 1 /* REPRESENTED_PORT */ +
>>> +                               flow_patterns->tnl_pmd_items_cnt) {
>>> +#else /* ! ALLOW_EXPERIMENTAL_API */
>>>     } else if (flow_patterns->tnl_pmd_items_cnt &&
>>>                pattern_index < flow_patterns->tnl_pmd_items_cnt) {
>>> +#endif /* ALLOW_EXPERIMENTAL_API */
>>>         return;
>>>     } else if (item->type == RTE_FLOW_ITEM_TYPE_ETH) {
>>>         const struct rte_flow_item_eth *eth_spec = item->spec; @@
>>> -1035,6 +1050,12 @@ free_flow_patterns(struct flow_patterns
>>> *patterns)
>>>     struct rte_flow_error error;
>>>     int i;
>>>
>>> +#ifdef ALLOW_EXPERIMENTAL_API
>>> +    /* REPRESENTED_PORT */
>>> +    free(CONST_CAST(void *, patterns->items[0].spec));
>>> +    free(CONST_CAST(void *, patterns->items[0].mask)); #endif /*
>>> +ALLOW_EXPERIMENTAL_API */
>>> +
>>>     if (patterns->tnl_pmd_items) {
>>>         struct rte_flow_item *tnl_pmd_items = patterns->tnl_pmd_items;
>>>         uint32_t tnl_pmd_items_cnt = patterns->tnl_pmd_items_cnt; @@
>>> -1049,7 +1070,12 @@ free_flow_patterns(struct flow_patterns
>>> *patterns)
>>>         }
>>>     }
>>>
>>> +#ifdef ALLOW_EXPERIMENTAL_API
>>> +    for (i = 1 /* REPRESENTED_PORT */ + patterns->tnl_pmd_items_cnt;
>>> +         i < patterns->cnt; i++) {
>>> +#else /* ! ALLOW_EXPERIMENTAL_API */
>>>     for (i = patterns->tnl_pmd_items_cnt; i < patterns->cnt; i++) {
>>> +#endif /* ALLOW_EXPERIMENTAL_API */
>>>         if (patterns->items[i].spec) {
>>>             free(CONST_CAST(void *, patterns->items[i].spec));
>>>         }
>>> @@ -1383,10 +1409,23 @@ parse_flow_match(struct netdev *netdev,
>>>                  struct flow_patterns *patterns,
>>>                  struct match *match) {
>>> +#ifdef ALLOW_EXPERIMENTAL_API
>>> +    struct netdev *physdev = netdev_ports_get(orig_in_port, netdev-
>>>> dpif_type);
>>> +    struct rte_flow_item_ethdev *ethdev_spec = xzalloc(sizeof
>*ethdev_spec);
>>> +    struct rte_flow_item_ethdev *ethdev_mask = xzalloc(sizeof
>>> *ethdev_mask);
>>> +#endif /* ALLOW_EXPERIMENTAL_API */
>>>     struct rte_flow_item_eth *eth_spec = NULL, *eth_mask = NULL;
>>>     struct flow *consumed_masks;
>>>     uint8_t proto = 0;
>>>
>>> +#ifdef ALLOW_EXPERIMENTAL_API
>>> +    /* Add an explicit traffic source item to the beginning of the pattern. */
>>> +    ethdev_spec->port_id = netdev_dpdk_get_port_id(physdev);
>>> +    *ethdev_mask = rte_flow_item_ethdev_mask;
>>> +    add_flow_pattern(patterns,
>>> RTE_FLOW_ITEM_TYPE_REPRESENTED_PORT,
>>> +                     ethdev_spec, ethdev_mask, NULL); #endif /*
>>> +ALLOW_EXPERIMENTAL_API */
>>> +
>>>     consumed_masks = &match->wc.masks;
>>>
>>>     if (!flow_tnl_dst_is_set(&match->flow.tunnel)) { @@ -2333,6
>>> +2372,7 @@ netdev_offload_dpdk_flow_destroy(struct
>>> ufid_to_rte_flow_data *rte_flow_data)
>>>     struct netdev *physdev;
>>>     struct netdev *netdev;
>>>     ovs_u128 *ufid;
>>> +    bool transfer;
>>>     int ret;
>>>
>>>     ovs_mutex_lock(&rte_flow_data->lock);
>>> @@ -2344,12 +2384,13 @@ netdev_offload_dpdk_flow_destroy(struct
>>> ufid_to_rte_flow_data *rte_flow_data)
>>>
>>>     rte_flow_data->dead = true;
>>>
>>> +    transfer = rte_flow_data->actions_offloaded;
>>>     rte_flow = rte_flow_data->rte_flow;
>>>     physdev = rte_flow_data->physdev;
>>>     netdev = rte_flow_data->netdev;
>>>     ufid = &rte_flow_data->ufid;
>>>
>>> -    ret = netdev_dpdk_rte_flow_destroy(physdev, rte_flow, &error);
>>> +    ret = netdev_dpdk_rte_flow_destroy(physdev, transfer, rte_flow,
>>> + &error);
>>>
>>>     if (ret == 0) {
>>>         struct netdev_offload_dpdk_data *data;
>>> --
>>> 2.30.2
>>
>>
Ivan Malov June 8, 2022, 2:46 p.m. UTC | #5
Hi Eli,

On Wed, 8 Jun 2022, Eli Britstein wrote:

> Hi Ivan,
>
>> -----Original Message-----
>> From: Ivan Malov <ivan.malov@oktetlabs.ru>
>> Sent: Tuesday, June 7, 2022 11:56 PM
>> To: Eli Britstein <elibr@nvidia.com>
>> Cc: dev@openvswitch.org; Andrew Rybchenko
>> <andrew.rybchenko@oktetlabs.ru>; Ilya Maximets <i.maximets@ovn.org>;
>> Ori Kam <orika@nvidia.com>; NBU-Contact-Thomas Monjalon (EXTERNAL)
>> <thomas@monjalon.net>; Stephen Hemminger
>> <stephen@networkplumber.org>; David Marchand
>> <david.marchand@redhat.com>; Gaetan Rivet <grive@u256.net>; Maxime
>> Coquelin <maxime.coquelin@redhat.com>
>> Subject: RE: [PATCH 3/3] netdev-offload-dpdk: use flow transfer proxy
>> mechanism
>>
>> External email: Use caution opening links or attachments
>>
>>
>> Hi Eli,
>>
>> On Wed, 1 Jun 2022, Eli Britstein wrote:
>>
>>> - Missing proper handling of the testpmd syntax logging. It changes the used
>> port according to "transfer", but the log still uses netdev_dpdk_get_port_id().
>>
>> Thanks for noticing. I will see to it in the next version.
>>
>>> - The usage of the "proxy" port for rte-flow implies that this proxy port is
>> attached to OVS, otherwise it is not "started" and creation of flows will fail.
>>
>> That's the way it is. If there is no proxy for a given port, then the original port
>> value will be used for managing flows. For vendors that don't need the proxy,
>> this will work. For others, it won't. That's OK.

> I don't really understand why this can't be done inside dpdk domain (if there is a proxy, and it is up, use it, otherwise don't).
> That's *currently* the way it is. I understand that if dpdk works like this OVS should align, but maybe you or someone else here knows why dpdk works like this? (not too late to change, this is experimental...).


Regardless of DPDK, on some NICs, it is possible to insert rules via
unprivileged PFs or VFs, but there are also NICs which cannot do it.

In DPDK, this contradiction has to be resolved somehow.
In example, for NICs that can only manage flows via
privileged ports, two possible solutions exist:

1. Route flow management requests from unprivileged ethdevs
    to the privileged one implicitly, inside the PMD. This
    is transparent to users, but, at the same time, it is
    tricky because the application does not realise that
    flows it manages via an ethdev "B" are in fact
    communicated to the NIC via an ethdev "A".

    Unaware of the implicit scheme, the application may
    detach the privileged ethdev "A" in between. And, when
    the time comes to remove flows, doing so via ethdev "B"
    will fail. This scheme breaks in-app housekeeping.

2. Expose the "proxy" port existence to the application.
    If it knows the truth about the real ethdev that
    handles the transfer flows, it won't attempt to
    detach it in-between. The housekeeping is fine.

Exposing the existence of the "proxy" port to users seems
like the most reasonable approach. This is why it was
implemented in DPDK like this. Currently, it's indeed
an experimental feature. DPDK PMDs which need it, are
supposed to switch to it during the transition phase.

However, I should stress out that to NICs that support
managing transfer flows on any PFs and VFs, this proxy
scheme is a don't care. The corresponding drivers may
not implement the proxy query method at all:

https://github.com/DPDK/dpdk/blob/main/lib/ethdev/rte_flow.c#L1345

The generic part of the API will just return
the original port ID to the application.


>>
>>>
>>>> -----Original Message-----
>>>> From: Ivan Malov <ivan.malov@oktetlabs.ru>
>>>> Sent: Monday, May 30, 2022 5:16 PM
>>>> To: dev@openvswitch.org
>>>> Cc: Andrew Rybchenko <andrew.rybchenko@oktetlabs.ru>; Ilya Maximets
>>>> <i.maximets@ovn.org>; Ori Kam <orika@nvidia.com>; Eli Britstein
>>>> <elibr@nvidia.com>; NBU-Contact-Thomas Monjalon (EXTERNAL)
>>>> <thomas@monjalon.net>; Stephen Hemminger
>>>> <stephen@networkplumber.org>; David Marchand
>>>> <david.marchand@redhat.com>; Gaetan Rivet <grive@u256.net>; Maxime
>>>> Coquelin <maxime.coquelin@redhat.com>
>>>> Subject: [PATCH 3/3] netdev-offload-dpdk: use flow transfer proxy
>>>> mechanism
>>>>
>>>> External email: Use caution opening links or attachments
>>>>
>>>>
>>>> Manage "transfer" flows via the corresponding mechanism.
>>>> Doing so requires that the traffic source be specified explicitly,
>>>> via the corresponding pattern item.
>>>>
>>>> Signed-off-by: Ivan Malov <ivan.malov@oktetlabs.ru>
>>>> Acked-by: Andrew Rybchenko <andrew.rybchenko@oktetlabs.ru>
>>>> ---
>>>> lib/netdev-dpdk.c         | 73 ++++++++++++++++++++++++++++++++-------
>>>> lib/netdev-dpdk.h         |  2 +-
>>>> lib/netdev-offload-dpdk.c | 43 ++++++++++++++++++++++-
>>>> 3 files changed, 103 insertions(+), 15 deletions(-)
>>>>
>>>> diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index
>>>> 45e5d26d2..d0bf4613a 100644
>>>> --- a/lib/netdev-dpdk.c
>>>> +++ b/lib/netdev-dpdk.c
>>>> @@ -420,6 +420,7 @@ enum dpdk_hw_ol_features {
>>>>
>>>> struct netdev_dpdk {
>>>>     PADDED_MEMBERS_CACHELINE_MARKER(CACHE_LINE_SIZE,
>> cacheline0,
>>>> +        dpdk_port_t flow_transfer_proxy_port_id;
>>>>         dpdk_port_t port_id;
>>>>
>>>>         /* If true, device was attached by rte_eth_dev_attach(). */
>>>> @@ -1115,6
>>>> +1116,23 @@ dpdk_eth_dev_init_rx_metadata(struct netdev_dpdk *dev)
>>>>                   DPDK_PORT_ID_FMT, dev->port_id);
>>>>     }
>>>> }
>>>> +
>>>> +static void
>>>> +dpdk_eth_dev_init_flow_transfer_proxy(struct netdev_dpdk *dev) {
>>>> +    int ret;
>>>> +
>>>> +    ret = rte_flow_pick_transfer_proxy(dev->port_id,
>>>> +                                       &dev->flow_transfer_proxy_port_id, NULL);
>>>> +    if (ret == 0)
>>>> +        return;
>>>> +
>>>> +    /*
>>>> +     * The PMD does not indicate the proxy port.
>>>> +     * It is OK to assume the proxy is unneeded.
>>>> +     */
>>>> +    dev->flow_transfer_proxy_port_id = dev->port_id; }
>>>> #endif /* ALLOW_EXPERIMENTAL_API */
>>>>
>>>> static int
>>>> @@ -1141,6 +1159,19 @@ dpdk_eth_dev_init(struct netdev_dpdk *dev)
>>>>      * Request delivery of such metadata.
>>>>      */
>>>>     dpdk_eth_dev_init_rx_metadata(dev);
>>>> +
>>>> +    /*
>>>> +     * Managing "transfer" flows requires that the user communicate them
>>>> +     * via a port which has the privilege to control the embedded switch.
>>>> +     * For some vendors, all ports in a given switching domain have
>>>> +     * this privilege. For other vendors, it's only one port.
>>>> +     *
>>>> +     * Get the proxy port ID and remember it for later use.
>>>> +     */
>>>> +    dpdk_eth_dev_init_flow_transfer_proxy(dev);
>>>> +#else /* ! ALLOW_EXPERIMENTAL_API */
>>>> +    /* It is OK to assume the proxy is unneeded. */
>>>> +    dev->flow_transfer_proxy_port_id = dev->port_id;
>>>> #endif /* ALLOW_EXPERIMENTAL_API */
>>>>
>>>>     rte_eth_dev_info_get(dev->port_id, &info); @@ -5214,13 +5245,15
>>>> @@ out:
>>>>
>>>> int
>>>> netdev_dpdk_rte_flow_destroy(struct netdev *netdev,
>>>> -                             struct rte_flow *rte_flow,
>>>> +                             bool transfer, struct rte_flow
>>>> + *rte_flow,
>>>>                              struct rte_flow_error *error) {
>>>>     struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
>>>>     int ret;
>>>>
>>>> -    ret = rte_flow_destroy(dev->port_id, rte_flow, error);
>>>> +    ret = rte_flow_destroy(transfer ?
>>>> +                           dev->flow_transfer_proxy_port_id : dev->port_id,
>>>> +                           rte_flow, error);
>>>>     return ret;
>>>> }
>>>>
>>>> @@ -5234,7 +5267,19 @@ netdev_dpdk_rte_flow_create(struct netdev
>>>> *netdev,
>>>>     struct rte_flow *flow;
>>>>     struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
>>>>
>>>> -    flow = rte_flow_create(dev->port_id, attr, items, actions, error);
>>>> +#ifdef ALLOW_EXPERIMENTAL_API
>>>> +    if (!attr->transfer) {
>>>> +        /*
>>>> +         * The 1st item in any pattern is a traffic source one.
>>>> +         * It is unnecessary in the case of non-transfer rules.
>>>> +         */
>>>> +        ++(items);
>>>> +    }
>>>> +#endif /* ALLOW_EXPERIMENTAL_API */
>>>> +
>>>> +    flow = rte_flow_create(attr->transfer ?
>>>> +                           dev->flow_transfer_proxy_port_id : dev->port_id,
>>>> +                           attr, items, actions, error);
>>>>     return flow;
>>>> }
>>>>
>>>> @@ -5262,7 +5307,8 @@ netdev_dpdk_rte_flow_query_count(struct
>> netdev
>>>> *netdev,
>>>>     }
>>>>
>>>>     dev = netdev_dpdk_cast(netdev);
>>>> -    ret = rte_flow_query(dev->port_id, rte_flow, actions, query, error);
>>>> +    ret = rte_flow_query(dev->flow_transfer_proxy_port_id, rte_flow,
>>>> +                         actions, query, error);
>>>>     return ret;
>>>> }
>>>>
>>>> @@ -5284,8 +5330,8 @@ netdev_dpdk_rte_flow_tunnel_decap_set(struct
>>>> netdev *netdev,
>>>>
>>>>     dev = netdev_dpdk_cast(netdev);
>>>>     ovs_mutex_lock(&dev->mutex);
>>>> -    ret = rte_flow_tunnel_decap_set(dev->port_id, tunnel, actions,
>>>> -                                    num_of_actions, error);
>>>> +    ret =
>>>> + rte_flow_tunnel_decap_set(dev->flow_transfer_proxy_port_id,
>>>> tunnel,
>>>> +                                    actions, num_of_actions, error);
>>>>     ovs_mutex_unlock(&dev->mutex);
>>>>     return ret;
>>>> }
>>>> @@ -5306,8 +5352,8 @@ netdev_dpdk_rte_flow_tunnel_match(struct
>>>> netdev *netdev,
>>>>
>>>>     dev = netdev_dpdk_cast(netdev);
>>>>     ovs_mutex_lock(&dev->mutex);
>>>> -    ret = rte_flow_tunnel_match(dev->port_id, tunnel, items,
>> num_of_items,
>>>> -                                error);
>>>> +    ret = rte_flow_tunnel_match(dev->flow_transfer_proxy_port_id,
>> tunnel,
>>>> +                                items, num_of_items, error);
>>>>     ovs_mutex_unlock(&dev->mutex);
>>>>     return ret;
>>>> }
>>>> @@ -5328,7 +5374,8 @@ netdev_dpdk_rte_flow_get_restore_info(struct
>>>> netdev *netdev,
>>>>
>>>>     dev = netdev_dpdk_cast(netdev);
>>>>     ovs_mutex_lock(&dev->mutex);
>>>> -    ret = rte_flow_get_restore_info(dev->port_id, m, info, error);
>>>> +    ret = rte_flow_get_restore_info(dev->flow_transfer_proxy_port_id,
>>>> +                                    m, info, error);
>>>>     ovs_mutex_unlock(&dev->mutex);
>>>>     return ret;
>>>> }
>>>> @@ -5349,8 +5396,8 @@
>>>> netdev_dpdk_rte_flow_tunnel_action_decap_release(
>>>>
>>>>     dev = netdev_dpdk_cast(netdev);
>>>>     ovs_mutex_lock(&dev->mutex);
>>>> -    ret = rte_flow_tunnel_action_decap_release(dev->port_id, actions,
>>>> -                                               num_of_actions, error);
>>>> +    ret = rte_flow_tunnel_action_decap_release(dev-
>>>>> flow_transfer_proxy_port_id,
>>>> +                                               actions,
>>>> + num_of_actions, error);
>>>>     ovs_mutex_unlock(&dev->mutex);
>>>>     return ret;
>>>> }
>>>> @@ -5370,8 +5417,8 @@
>> netdev_dpdk_rte_flow_tunnel_item_release(struct
>>>> netdev *netdev,
>>>>
>>>>     dev = netdev_dpdk_cast(netdev);
>>>>     ovs_mutex_lock(&dev->mutex);
>>>> -    ret = rte_flow_tunnel_item_release(dev->port_id, items,
>> num_of_items,
>>>> -                                       error);
>>>> +    ret = rte_flow_tunnel_item_release(dev->flow_transfer_proxy_port_id,
>>>> +                                       items, num_of_items, error);
>>>>     ovs_mutex_unlock(&dev->mutex);
>>>>     return ret;
>>>> }
>>>> diff --git a/lib/netdev-dpdk.h b/lib/netdev-dpdk.h index
>>>> 699be3fb4..1dd5953a4 100644
>>>> --- a/lib/netdev-dpdk.h
>>>> +++ b/lib/netdev-dpdk.h
>>>> @@ -35,7 +35,7 @@ bool netdev_dpdk_flow_api_supported(struct netdev
>>>> *);
>>>>
>>>> int
>>>> netdev_dpdk_rte_flow_destroy(struct netdev *netdev,
>>>> -                             struct rte_flow *rte_flow,
>>>> +                             bool transfer, struct rte_flow
>>>> + *rte_flow,
>>>>                              struct rte_flow_error *error); struct
>>>> rte_flow * netdev_dpdk_rte_flow_create(struct netdev *netdev, diff
>>>> --git a/lib/netdev-offload-dpdk.c b/lib/netdev-offload-dpdk.c index
>>>> 9cd5a0159..f8b90cbf7 100644
>>>> --- a/lib/netdev-offload-dpdk.c
>>>> +++ b/lib/netdev-offload-dpdk.c
>>>> @@ -353,8 +353,23 @@ dump_flow_pattern(struct ds *s,
>>>>
>>>>     if (item->type == RTE_FLOW_ITEM_TYPE_END) {
>>>>         ds_put_cstr(s, "end ");
>>>> +#ifdef ALLOW_EXPERIMENTAL_API
>>>> +    } else if (item->type == RTE_FLOW_ITEM_TYPE_REPRESENTED_PORT) {
>>>> +        const struct rte_flow_item_ethdev *ethdev_spec = item->spec;
>>>> +        const struct rte_flow_item_ethdev *ethdev_mask = item->mask;
>>>> +
>>>> +        ds_put_cstr(s, "represented_port ");
>>>> +
>>>> +        DUMP_PATTERN_ITEM(ethdev_mask->port_id, false,
>> "ethdev_port_id",
>>>> +                          "%"PRIu16, ethdev_spec->port_id,
>>>> +                          ethdev_mask->port_id, 0);
>>>> +    } else if (flow_patterns->tnl_pmd_items_cnt &&
>>>> +               pattern_index < 1 /* REPRESENTED_PORT */ +
>>>> +                               flow_patterns->tnl_pmd_items_cnt) {
>>>> +#else /* ! ALLOW_EXPERIMENTAL_API */
>>>>     } else if (flow_patterns->tnl_pmd_items_cnt &&
>>>>                pattern_index < flow_patterns->tnl_pmd_items_cnt) {
>>>> +#endif /* ALLOW_EXPERIMENTAL_API */
>>>>         return;
>>>>     } else if (item->type == RTE_FLOW_ITEM_TYPE_ETH) {
>>>>         const struct rte_flow_item_eth *eth_spec = item->spec; @@
>>>> -1035,6 +1050,12 @@ free_flow_patterns(struct flow_patterns
>>>> *patterns)
>>>>     struct rte_flow_error error;
>>>>     int i;
>>>>
>>>> +#ifdef ALLOW_EXPERIMENTAL_API
>>>> +    /* REPRESENTED_PORT */
>>>> +    free(CONST_CAST(void *, patterns->items[0].spec));
>>>> +    free(CONST_CAST(void *, patterns->items[0].mask)); #endif /*
>>>> +ALLOW_EXPERIMENTAL_API */
>>>> +
>>>>     if (patterns->tnl_pmd_items) {
>>>>         struct rte_flow_item *tnl_pmd_items = patterns->tnl_pmd_items;
>>>>         uint32_t tnl_pmd_items_cnt = patterns->tnl_pmd_items_cnt; @@
>>>> -1049,7 +1070,12 @@ free_flow_patterns(struct flow_patterns
>>>> *patterns)
>>>>         }
>>>>     }
>>>>
>>>> +#ifdef ALLOW_EXPERIMENTAL_API
>>>> +    for (i = 1 /* REPRESENTED_PORT */ + patterns->tnl_pmd_items_cnt;
>>>> +         i < patterns->cnt; i++) {
>>>> +#else /* ! ALLOW_EXPERIMENTAL_API */
>>>>     for (i = patterns->tnl_pmd_items_cnt; i < patterns->cnt; i++) {
>>>> +#endif /* ALLOW_EXPERIMENTAL_API */
>>>>         if (patterns->items[i].spec) {
>>>>             free(CONST_CAST(void *, patterns->items[i].spec));
>>>>         }
>>>> @@ -1383,10 +1409,23 @@ parse_flow_match(struct netdev *netdev,
>>>>                  struct flow_patterns *patterns,
>>>>                  struct match *match) {
>>>> +#ifdef ALLOW_EXPERIMENTAL_API
>>>> +    struct netdev *physdev = netdev_ports_get(orig_in_port, netdev-
>>>>> dpif_type);
>>>> +    struct rte_flow_item_ethdev *ethdev_spec = xzalloc(sizeof
>> *ethdev_spec);
>>>> +    struct rte_flow_item_ethdev *ethdev_mask = xzalloc(sizeof
>>>> *ethdev_mask);
>>>> +#endif /* ALLOW_EXPERIMENTAL_API */
>>>>     struct rte_flow_item_eth *eth_spec = NULL, *eth_mask = NULL;
>>>>     struct flow *consumed_masks;
>>>>     uint8_t proto = 0;
>>>>
>>>> +#ifdef ALLOW_EXPERIMENTAL_API
>>>> +    /* Add an explicit traffic source item to the beginning of the pattern. */
>>>> +    ethdev_spec->port_id = netdev_dpdk_get_port_id(physdev);
>>>> +    *ethdev_mask = rte_flow_item_ethdev_mask;
>>>> +    add_flow_pattern(patterns,
>>>> RTE_FLOW_ITEM_TYPE_REPRESENTED_PORT,
>>>> +                     ethdev_spec, ethdev_mask, NULL); #endif /*
>>>> +ALLOW_EXPERIMENTAL_API */
>>>> +
>>>>     consumed_masks = &match->wc.masks;
>>>>
>>>>     if (!flow_tnl_dst_is_set(&match->flow.tunnel)) { @@ -2333,6
>>>> +2372,7 @@ netdev_offload_dpdk_flow_destroy(struct
>>>> ufid_to_rte_flow_data *rte_flow_data)
>>>>     struct netdev *physdev;
>>>>     struct netdev *netdev;
>>>>     ovs_u128 *ufid;
>>>> +    bool transfer;
>>>>     int ret;
>>>>
>>>>     ovs_mutex_lock(&rte_flow_data->lock);
>>>> @@ -2344,12 +2384,13 @@ netdev_offload_dpdk_flow_destroy(struct
>>>> ufid_to_rte_flow_data *rte_flow_data)
>>>>
>>>>     rte_flow_data->dead = true;
>>>>
>>>> +    transfer = rte_flow_data->actions_offloaded;
>>>>     rte_flow = rte_flow_data->rte_flow;
>>>>     physdev = rte_flow_data->physdev;
>>>>     netdev = rte_flow_data->netdev;
>>>>     ufid = &rte_flow_data->ufid;
>>>>
>>>> -    ret = netdev_dpdk_rte_flow_destroy(physdev, rte_flow, &error);
>>>> +    ret = netdev_dpdk_rte_flow_destroy(physdev, transfer, rte_flow,
>>>> + &error);
>>>>
>>>>     if (ret == 0) {
>>>>         struct netdev_offload_dpdk_data *data;
>>>> --
>>>> 2.30.2
>>>
>>>
>
Eli Britstein June 8, 2022, 3:52 p.m. UTC | #6
Hi Ivan,

>-----Original Message-----
>From: Ivan Malov <ivan.malov@oktetlabs.ru>
>Sent: Wednesday, June 8, 2022 5:46 PM
>To: Eli Britstein <elibr@nvidia.com>
>Cc: dev@openvswitch.org; Andrew Rybchenko
><andrew.rybchenko@oktetlabs.ru>; Ilya Maximets <i.maximets@ovn.org>;
>Ori Kam <orika@nvidia.com>; NBU-Contact-Thomas Monjalon (EXTERNAL)
><thomas@monjalon.net>; Stephen Hemminger
><stephen@networkplumber.org>; David Marchand
><david.marchand@redhat.com>; Gaetan Rivet <grive@u256.net>; Maxime
>Coquelin <maxime.coquelin@redhat.com>
>Subject: RE: [PATCH 3/3] netdev-offload-dpdk: use flow transfer proxy
>mechanism
>
>External email: Use caution opening links or attachments
>
>
>Hi Eli,
>
>On Wed, 8 Jun 2022, Eli Britstein wrote:
>
>> Hi Ivan,
>>
>>> -----Original Message-----
>>> From: Ivan Malov <ivan.malov@oktetlabs.ru>
>>> Sent: Tuesday, June 7, 2022 11:56 PM
>>> To: Eli Britstein <elibr@nvidia.com>
>>> Cc: dev@openvswitch.org; Andrew Rybchenko
>>> <andrew.rybchenko@oktetlabs.ru>; Ilya Maximets <i.maximets@ovn.org>;
>>> Ori Kam <orika@nvidia.com>; NBU-Contact-Thomas Monjalon (EXTERNAL)
>>> <thomas@monjalon.net>; Stephen Hemminger
>>> <stephen@networkplumber.org>; David Marchand
>>> <david.marchand@redhat.com>; Gaetan Rivet <grive@u256.net>; Maxime
>>> Coquelin <maxime.coquelin@redhat.com>
>>> Subject: RE: [PATCH 3/3] netdev-offload-dpdk: use flow transfer proxy
>>> mechanism
>>>
>>> External email: Use caution opening links or attachments
>>>
>>>
>>> Hi Eli,
>>>
>>> On Wed, 1 Jun 2022, Eli Britstein wrote:
>>>
>>>> - Missing proper handling of the testpmd syntax logging. It changes
>>>> the used
>>> port according to "transfer", but the log still uses
>netdev_dpdk_get_port_id().
>>>
>>> Thanks for noticing. I will see to it in the next version.
>>>
>>>> - The usage of the "proxy" port for rte-flow implies that this proxy
>>>> port is
>>> attached to OVS, otherwise it is not "started" and creation of flows will
>fail.
>>>
>>> That's the way it is. If there is no proxy for a given port, then the
>>> original port value will be used for managing flows. For vendors that
>>> don't need the proxy, this will work. For others, it won't. That's OK.
>
>> I don't really understand why this can't be done inside dpdk domain (if there
>is a proxy, and it is up, use it, otherwise don't).
>> That's *currently* the way it is. I understand that if dpdk works like this OVS
>should align, but maybe you or someone else here knows why dpdk works like
>this? (not too late to change, this is experimental...).
>
>
>Regardless of DPDK, on some NICs, it is possible to insert rules via
>unprivileged PFs or VFs, but there are also NICs which cannot do it.
>
>In DPDK, this contradiction has to be resolved somehow.
>In example, for NICs that can only manage flows via privileged ports, two
>possible solutions exist:
>
>1. Route flow management requests from unprivileged ethdevs
>    to the privileged one implicitly, inside the PMD. This
>    is transparent to users, but, at the same time, it is
>    tricky because the application does not realise that
>    flows it manages via an ethdev "B" are in fact
>    communicated to the NIC via an ethdev "A".
>
>    Unbeknownst of the implicit scheme, the application may
>    detach the privileged ethdev "A" in-between. And, when
>    time comes to remove flows, doing so via ethdev "B"
>    will fail. This scheme breaks in-app housekeeping.
>
>2. Expose the "proxy" port existence to the application.
>    If it knows the truth about the real ethdev that
>    handles the transfer flows, it won't attempt to
>    detach it in-between. The housekeeping is fine.
>
>Outing the existence of the "proxy" port to users seems like the most
>reasonable approach. This is why it was implemented in DPDK like this.
>Currently, it's indeed an experimental feature. DPDK PMDs which need it, are
>supposed to switch to it during the transition phase.
Thanks very much for the explanation, though IMHO relevant PMDs could still hide it and not do this "outing" of their internals.
>
>However, I should stress out that to NICs that support managing transfer
>flows on any PFs and VFs, this proxy scheme is a don't care. The
>corresponding drivers may not implement the proxy query method at all:
>
>https://github.com/DPDK/dpdk/blob/main/lib/ethdev/rte_flow.c#L1345
>The generic part of the API will just return the original port ID to the
>application.
Yes, I saw that. Thanks.
>
>
>>>
>>>>
>>>>> -----Original Message-----
>>>>> From: Ivan Malov <ivan.malov@oktetlabs.ru>
>>>>> Sent: Monday, May 30, 2022 5:16 PM
>>>>> To: dev@openvswitch.org
>>>>> Cc: Andrew Rybchenko <andrew.rybchenko@oktetlabs.ru>; Ilya
>Maximets
>>>>> <i.maximets@ovn.org>; Ori Kam <orika@nvidia.com>; Eli Britstein
>>>>> <elibr@nvidia.com>; NBU-Contact-Thomas Monjalon (EXTERNAL)
>>>>> <thomas@monjalon.net>; Stephen Hemminger
>>>>> <stephen@networkplumber.org>; David Marchand
>>>>> <david.marchand@redhat.com>; Gaetan Rivet <grive@u256.net>;
>Maxime
>>>>> Coquelin <maxime.coquelin@redhat.com>
>>>>> Subject: [PATCH 3/3] netdev-offload-dpdk: use flow transfer proxy
>>>>> mechanism
>>>>>
>>>>> External email: Use caution opening links or attachments
>>>>>
>>>>>
>>>>> Manage "transfer" flows via the corresponding mechanism.
>>>>> Doing so requires that the traffic source be specified explicitly,
>>>>> via the corresponding pattern item.
>>>>>
>>>>> Signed-off-by: Ivan Malov <ivan.malov@oktetlabs.ru>
>>>>> Acked-by: Andrew Rybchenko <andrew.rybchenko@oktetlabs.ru>
>>>>> ---
>>>>> lib/netdev-dpdk.c         | 73 ++++++++++++++++++++++++++++++++-------
>>>>> lib/netdev-dpdk.h         |  2 +-
>>>>> lib/netdev-offload-dpdk.c | 43 ++++++++++++++++++++++-
>>>>> 3 files changed, 103 insertions(+), 15 deletions(-)
>>>>>
>>>>> diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index
>>>>> 45e5d26d2..d0bf4613a 100644
>>>>> --- a/lib/netdev-dpdk.c
>>>>> +++ b/lib/netdev-dpdk.c
>>>>> @@ -420,6 +420,7 @@ enum dpdk_hw_ol_features {
>>>>>
>>>>> struct netdev_dpdk {
>>>>>     PADDED_MEMBERS_CACHELINE_MARKER(CACHE_LINE_SIZE,
>>> cacheline0,
>>>>> +        dpdk_port_t flow_transfer_proxy_port_id;
>>>>>         dpdk_port_t port_id;
>>>>>
>>>>>         /* If true, device was attached by rte_eth_dev_attach(). */
>>>>> @@ -1115,6
>>>>> +1116,23 @@ dpdk_eth_dev_init_rx_metadata(struct netdev_dpdk
>*dev)
>>>>>                   DPDK_PORT_ID_FMT, dev->port_id);
>>>>>     }
>>>>> }
>>>>> +
>>>>> +static void
>>>>> +dpdk_eth_dev_init_flow_transfer_proxy(struct netdev_dpdk *dev) {
>>>>> +    int ret;
>>>>> +
>>>>> +    ret = rte_flow_pick_transfer_proxy(dev->port_id,
>>>>> +                                       &dev->flow_transfer_proxy_port_id, NULL);
>>>>> +    if (ret == 0)
>>>>> +        return;
>>>>> +
>>>>> +    /*
>>>>> +     * The PMD does not indicate the proxy port.
>>>>> +     * It is OK to assume the proxy is unneeded.
>>>>> +     */
>>>>> +    dev->flow_transfer_proxy_port_id = dev->port_id; }
>>>>> #endif /* ALLOW_EXPERIMENTAL_API */
>>>>>
>>>>> static int
>>>>> @@ -1141,6 +1159,19 @@ dpdk_eth_dev_init(struct netdev_dpdk *dev)
>>>>>      * Request delivery of such metadata.
>>>>>      */
>>>>>     dpdk_eth_dev_init_rx_metadata(dev);
>>>>> +
>>>>> +    /*
>>>>> +     * Managing "transfer" flows requires that the user communicate
>them
>>>>> +     * via a port which has the privilege to control the embedded switch.
>>>>> +     * For some vendors, all ports in a given switching domain have
>>>>> +     * this privilege. For other vendors, it's only one port.
>>>>> +     *
>>>>> +     * Get the proxy port ID and remember it for later use.
>>>>> +     */
>>>>> +    dpdk_eth_dev_init_flow_transfer_proxy(dev);
>>>>> +#else /* ! ALLOW_EXPERIMENTAL_API */
>>>>> +    /* It is OK to assume the proxy is unneeded. */
>>>>> +    dev->flow_transfer_proxy_port_id = dev->port_id;
>>>>> #endif /* ALLOW_EXPERIMENTAL_API */
>>>>>
>>>>>     rte_eth_dev_info_get(dev->port_id, &info); @@ -5214,13 +5245,15
>>>>> @@ out:
>>>>>
>>>>> int
>>>>> netdev_dpdk_rte_flow_destroy(struct netdev *netdev,
>>>>> -                             struct rte_flow *rte_flow,
>>>>> +                             bool transfer, struct rte_flow
>>>>> + *rte_flow,
>>>>>                              struct rte_flow_error *error) {
>>>>>     struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
>>>>>     int ret;
>>>>>
>>>>> -    ret = rte_flow_destroy(dev->port_id, rte_flow, error);
>>>>> +    ret = rte_flow_destroy(transfer ?
>>>>> +                           dev->flow_transfer_proxy_port_id : dev->port_id,
>>>>> +                           rte_flow, error);
>>>>>     return ret;
>>>>> }
>>>>>
>>>>> @@ -5234,7 +5267,19 @@ netdev_dpdk_rte_flow_create(struct netdev
>>>>> *netdev,
>>>>>     struct rte_flow *flow;
>>>>>     struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
>>>>>
>>>>> -    flow = rte_flow_create(dev->port_id, attr, items, actions, error);
>>>>> +#ifdef ALLOW_EXPERIMENTAL_API
>>>>> +    if (!attr->transfer) {
>>>>> +        /*
>>>>> +         * The 1st item in any pattern is a traffic source one.
>>>>> +         * It is unnecessary in the case of non-transfer rules.
>>>>> +         */
>>>>> +        ++(items);
>>>>> +    }
>>>>> +#endif /* ALLOW_EXPERIMENTAL_API */
>>>>> +
>>>>> +    flow = rte_flow_create(attr->transfer ?
>>>>> +                           dev->flow_transfer_proxy_port_id : dev->port_id,
>>>>> +                           attr, items, actions, error);
>>>>>     return flow;
>>>>> }
>>>>>
>>>>> @@ -5262,7 +5307,8 @@ netdev_dpdk_rte_flow_query_count(struct
>>> netdev
>>>>> *netdev,
>>>>>     }
>>>>>
>>>>>     dev = netdev_dpdk_cast(netdev);
>>>>> -    ret = rte_flow_query(dev->port_id, rte_flow, actions, query, error);
>>>>> +    ret = rte_flow_query(dev->flow_transfer_proxy_port_id, rte_flow,
>>>>> +                         actions, query, error);
>>>>>     return ret;
>>>>> }
>>>>>
>>>>> @@ -5284,8 +5330,8 @@
>netdev_dpdk_rte_flow_tunnel_decap_set(struct
>>>>> netdev *netdev,
>>>>>
>>>>>     dev = netdev_dpdk_cast(netdev);
>>>>>     ovs_mutex_lock(&dev->mutex);
>>>>> -    ret = rte_flow_tunnel_decap_set(dev->port_id, tunnel, actions,
>>>>> -                                    num_of_actions, error);
>>>>> +    ret =
>>>>> + rte_flow_tunnel_decap_set(dev->flow_transfer_proxy_port_id,
>>>>> tunnel,
>>>>> +                                    actions, num_of_actions,
>>>>> + error);
>>>>>     ovs_mutex_unlock(&dev->mutex);
>>>>>     return ret;
>>>>> }
>>>>> @@ -5306,8 +5352,8 @@ netdev_dpdk_rte_flow_tunnel_match(struct
>>>>> netdev *netdev,
>>>>>
>>>>>     dev = netdev_dpdk_cast(netdev);
>>>>>     ovs_mutex_lock(&dev->mutex);
>>>>> -    ret = rte_flow_tunnel_match(dev->port_id, tunnel, items,
>>> num_of_items,
>>>>> -                                error);
>>>>> +    ret = rte_flow_tunnel_match(dev->flow_transfer_proxy_port_id,
>>> tunnel,
>>>>> +                                items, num_of_items, error);
>>>>>     ovs_mutex_unlock(&dev->mutex);
>>>>>     return ret;
>>>>> }
>>>>> @@ -5328,7 +5374,8 @@
>netdev_dpdk_rte_flow_get_restore_info(struct
>>>>> netdev *netdev,
>>>>>
>>>>>     dev = netdev_dpdk_cast(netdev);
>>>>>     ovs_mutex_lock(&dev->mutex);
>>>>> -    ret = rte_flow_get_restore_info(dev->port_id, m, info, error);
>>>>> +    ret = rte_flow_get_restore_info(dev->flow_transfer_proxy_port_id,
>>>>> +                                    m, info, error);
>>>>>     ovs_mutex_unlock(&dev->mutex);
>>>>>     return ret;
>>>>> }
>>>>> @@ -5349,8 +5396,8 @@
>>>>> netdev_dpdk_rte_flow_tunnel_action_decap_release(
>>>>>
>>>>>     dev = netdev_dpdk_cast(netdev);
>>>>>     ovs_mutex_lock(&dev->mutex);
>>>>> -    ret = rte_flow_tunnel_action_decap_release(dev->port_id, actions,
>>>>> -                                               num_of_actions, error);
>>>>> +    ret = rte_flow_tunnel_action_decap_release(dev-
>>>>>> flow_transfer_proxy_port_id,
>>>>> +                                               actions,
>>>>> + num_of_actions, error);
>>>>>     ovs_mutex_unlock(&dev->mutex);
>>>>>     return ret;
>>>>> }
>>>>> @@ -5370,8 +5417,8 @@
>>> netdev_dpdk_rte_flow_tunnel_item_release(struct
>>>>> netdev *netdev,
>>>>>
>>>>>     dev = netdev_dpdk_cast(netdev);
>>>>>     ovs_mutex_lock(&dev->mutex);
>>>>> -    ret = rte_flow_tunnel_item_release(dev->port_id, items,
>>> num_of_items,
>>>>> -                                       error);
>>>>> +    ret = rte_flow_tunnel_item_release(dev-
>>flow_transfer_proxy_port_id,
>>>>> +                                       items, num_of_items,
>>>>> + error);
>>>>>     ovs_mutex_unlock(&dev->mutex);
>>>>>     return ret;
>>>>> }
>>>>> diff --git a/lib/netdev-dpdk.h b/lib/netdev-dpdk.h index
>>>>> 699be3fb4..1dd5953a4 100644
>>>>> --- a/lib/netdev-dpdk.h
>>>>> +++ b/lib/netdev-dpdk.h
>>>>> @@ -35,7 +35,7 @@ bool netdev_dpdk_flow_api_supported(struct
>netdev
>>>>> *);
>>>>>
>>>>> int
>>>>> netdev_dpdk_rte_flow_destroy(struct netdev *netdev,
>>>>> -                             struct rte_flow *rte_flow,
>>>>> +                             bool transfer, struct rte_flow
>>>>> + *rte_flow,
>>>>>                              struct rte_flow_error *error); struct
>>>>> rte_flow * netdev_dpdk_rte_flow_create(struct netdev *netdev, diff
>>>>> --git a/lib/netdev-offload-dpdk.c b/lib/netdev-offload-dpdk.c index
>>>>> 9cd5a0159..f8b90cbf7 100644
>>>>> --- a/lib/netdev-offload-dpdk.c
>>>>> +++ b/lib/netdev-offload-dpdk.c
>>>>> @@ -353,8 +353,23 @@ dump_flow_pattern(struct ds *s,
>>>>>
>>>>>     if (item->type == RTE_FLOW_ITEM_TYPE_END) {
>>>>>         ds_put_cstr(s, "end ");
>>>>> +#ifdef ALLOW_EXPERIMENTAL_API
>>>>> +    } else if (item->type ==
>RTE_FLOW_ITEM_TYPE_REPRESENTED_PORT) {
>>>>> +        const struct rte_flow_item_ethdev *ethdev_spec = item->spec;
>>>>> +        const struct rte_flow_item_ethdev *ethdev_mask =
>>>>> +item->mask;
>>>>> +
>>>>> +        ds_put_cstr(s, "represented_port ");
>>>>> +
>>>>> +        DUMP_PATTERN_ITEM(ethdev_mask->port_id, false,
>>> "ethdev_port_id",
>>>>> +                          "%"PRIu16, ethdev_spec->port_id,
>>>>> +                          ethdev_mask->port_id, 0);
>>>>> +    } else if (flow_patterns->tnl_pmd_items_cnt &&
>>>>> +               pattern_index < 1 /* REPRESENTED_PORT */ +
>>>>> +                               flow_patterns->tnl_pmd_items_cnt) {
>>>>> +#else /* ! ALLOW_EXPERIMENTAL_API */
>>>>>     } else if (flow_patterns->tnl_pmd_items_cnt &&
>>>>>                pattern_index < flow_patterns->tnl_pmd_items_cnt) {
>>>>> +#endif /* ALLOW_EXPERIMENTAL_API */
>>>>>         return;
>>>>>     } else if (item->type == RTE_FLOW_ITEM_TYPE_ETH) {
>>>>>         const struct rte_flow_item_eth *eth_spec = item->spec; @@
>>>>> -1035,6 +1050,12 @@ free_flow_patterns(struct flow_patterns
>>>>> *patterns)
>>>>>     struct rte_flow_error error;
>>>>>     int i;
>>>>>
>>>>> +#ifdef ALLOW_EXPERIMENTAL_API
>>>>> +    /* REPRESENTED_PORT */
>>>>> +    free(CONST_CAST(void *, patterns->items[0].spec));
>>>>> +    free(CONST_CAST(void *, patterns->items[0].mask)); #endif /*
>>>>> +ALLOW_EXPERIMENTAL_API */
>>>>> +
>>>>>     if (patterns->tnl_pmd_items) {
>>>>>         struct rte_flow_item *tnl_pmd_items = patterns->tnl_pmd_items;
>>>>>         uint32_t tnl_pmd_items_cnt = patterns->tnl_pmd_items_cnt;
>>>>> @@
>>>>> -1049,7 +1070,12 @@ free_flow_patterns(struct flow_patterns
>>>>> *patterns)
>>>>>         }
>>>>>     }
>>>>>
>>>>> +#ifdef ALLOW_EXPERIMENTAL_API
>>>>> +    for (i = 1 /* REPRESENTED_PORT */ + patterns->tnl_pmd_items_cnt;
>>>>> +         i < patterns->cnt; i++) { #else /* !
>>>>> +ALLOW_EXPERIMENTAL_API */
>>>>>     for (i = patterns->tnl_pmd_items_cnt; i < patterns->cnt; i++) {
>>>>> +#endif /* ALLOW_EXPERIMENTAL_API */
>>>>>         if (patterns->items[i].spec) {
>>>>>             free(CONST_CAST(void *, patterns->items[i].spec));
>>>>>         }
>>>>> @@ -1383,10 +1409,23 @@ parse_flow_match(struct netdev *netdev,
>>>>>                  struct flow_patterns *patterns,
>>>>>                  struct match *match) {
>>>>> +#ifdef ALLOW_EXPERIMENTAL_API
>>>>> +    struct netdev *physdev = netdev_ports_get(orig_in_port,
>>>>> +netdev-
>>>>>> dpif_type);
>>>>> +    struct rte_flow_item_ethdev *ethdev_spec = xzalloc(sizeof
>>> *ethdev_spec);
>>>>> +    struct rte_flow_item_ethdev *ethdev_mask = xzalloc(sizeof
>>>>> *ethdev_mask);
>>>>> +#endif /* ALLOW_EXPERIMENTAL_API */
>>>>>     struct rte_flow_item_eth *eth_spec = NULL, *eth_mask = NULL;
>>>>>     struct flow *consumed_masks;
>>>>>     uint8_t proto = 0;
>>>>>
>>>>> +#ifdef ALLOW_EXPERIMENTAL_API
>>>>> +    /* Add an explicit traffic source item to the beginning of the pattern.
>*/
>>>>> +    ethdev_spec->port_id = netdev_dpdk_get_port_id(physdev);
>>>>> +    *ethdev_mask = rte_flow_item_ethdev_mask;
>>>>> +    add_flow_pattern(patterns,
>>>>> RTE_FLOW_ITEM_TYPE_REPRESENTED_PORT,
>>>>> +                     ethdev_spec, ethdev_mask, NULL); #endif /*
>>>>> +ALLOW_EXPERIMENTAL_API */
>>>>> +
>>>>>     consumed_masks = &match->wc.masks;
>>>>>
>>>>>     if (!flow_tnl_dst_is_set(&match->flow.tunnel)) { @@ -2333,6
>>>>> +2372,7 @@ netdev_offload_dpdk_flow_destroy(struct
>>>>> ufid_to_rte_flow_data *rte_flow_data)
>>>>>     struct netdev *physdev;
>>>>>     struct netdev *netdev;
>>>>>     ovs_u128 *ufid;
>>>>> +    bool transfer;
>>>>>     int ret;
>>>>>
>>>>>     ovs_mutex_lock(&rte_flow_data->lock);
>>>>> @@ -2344,12 +2384,13 @@ netdev_offload_dpdk_flow_destroy(struct
>>>>> ufid_to_rte_flow_data *rte_flow_data)
>>>>>
>>>>>     rte_flow_data->dead = true;
>>>>>
>>>>> +    transfer = rte_flow_data->actions_offloaded;
>>>>>     rte_flow = rte_flow_data->rte_flow;
>>>>>     physdev = rte_flow_data->physdev;
>>>>>     netdev = rte_flow_data->netdev;
>>>>>     ufid = &rte_flow_data->ufid;
>>>>>
>>>>> -    ret = netdev_dpdk_rte_flow_destroy(physdev, rte_flow, &error);
>>>>> +    ret = netdev_dpdk_rte_flow_destroy(physdev, transfer,
>>>>> + rte_flow, &error);
>>>>>
>>>>>     if (ret == 0) {
>>>>>         struct netdev_offload_dpdk_data *data;
>>>>> --
>>>>> 2.30.2
>>>>
>>>>
>>
Ivan Malov June 8, 2022, 7:02 p.m. UTC | #7
Hi Eli,

On Wed, 8 Jun 2022, Eli Britstein wrote:

> Hi Ivan,
>
>> -----Original Message-----
>> From: Ivan Malov <ivan.malov@oktetlabs.ru>
>> Sent: Wednesday, June 8, 2022 5:46 PM
>> To: Eli Britstein <elibr@nvidia.com>
>> Cc: dev@openvswitch.org; Andrew Rybchenko
>> <andrew.rybchenko@oktetlabs.ru>; Ilya Maximets <i.maximets@ovn.org>;
>> Ori Kam <orika@nvidia.com>; NBU-Contact-Thomas Monjalon (EXTERNAL)
>> <thomas@monjalon.net>; Stephen Hemminger
>> <stephen@networkplumber.org>; David Marchand
>> <david.marchand@redhat.com>; Gaetan Rivet <grive@u256.net>; Maxime
>> Coquelin <maxime.coquelin@redhat.com>
>> Subject: RE: [PATCH 3/3] netdev-offload-dpdk: use flow transfer proxy
>> mechanism
>>
>> External email: Use caution opening links or attachments
>>
>>
>> Hi Eli,
>>
>> On Wed, 8 Jun 2022, Eli Britstein wrote:
>>
>>> Hi Ivan,
>>>
>>>> -----Original Message-----
>>>> From: Ivan Malov <ivan.malov@oktetlabs.ru>
>>>> Sent: Tuesday, June 7, 2022 11:56 PM
>>>> To: Eli Britstein <elibr@nvidia.com>
>>>> Cc: dev@openvswitch.org; Andrew Rybchenko
>>>> <andrew.rybchenko@oktetlabs.ru>; Ilya Maximets <i.maximets@ovn.org>;
>>>> Ori Kam <orika@nvidia.com>; NBU-Contact-Thomas Monjalon (EXTERNAL)
>>>> <thomas@monjalon.net>; Stephen Hemminger
>>>> <stephen@networkplumber.org>; David Marchand
>>>> <david.marchand@redhat.com>; Gaetan Rivet <grive@u256.net>; Maxime
>>>> Coquelin <maxime.coquelin@redhat.com>
>>>> Subject: RE: [PATCH 3/3] netdev-offload-dpdk: use flow transfer proxy
>>>> mechanism
>>>>
>>>> External email: Use caution opening links or attachments
>>>>
>>>>
>>>> Hi Eli,
>>>>
>>>> On Wed, 1 Jun 2022, Eli Britstein wrote:
>>>>
>>>>> - Missing proper handling of the testpmd syntax logging. It changes
>>>>> the used
>>>> port according to "transfer", but the log still uses
>> netdev_dpdk_get_port_id().
>>>>
>>>> Thanks for noticing. I will see to it in the next version.
>>>>
>>>>> - The usage of the "proxy" port for rte-flow implies that this proxy
>>>>> port is
>>>> attached to OVS, otherwise it is not "started" and creation of flows will
>> fail.
>>>>
>>>> That's the way it is. If there is no proxy for a given port, then the
>>>> original port value will be used for managing flows. For vendors that
>>>> don't need the proxy, this will work. For others, it won't. That's OK.
>>
>>> I don't really understand why this can't be done inside dpdk domain (if there
>> is a proxy, and it is up, use it, otherwise don't).
>>> That's *currently* the way it is. I understand that if dpdk works like this OVS
>> should align, but maybe you or someone else here knows why dpdk works like
>> this? (not too late to change, this is experimental...).
>>
>>
>> Regardless of DPDK, on some NICs, it is possible to insert rules via
>> unprivileged PFs or VFs, but there are also NICs which cannot do it.
>>
>> In DPDK, this contradiction has to be resolved somehow.
>> In example, for NICs that can only manage flows via privileged ports, two
>> possible solutions exist:
>>
>> 1. Route flow management requests from unprivileged ethdevs
>>    to the privileged one implicitly, inside the PMD. This
>>    is transparent to users, but, at the same time, it is
>>    tricky because the application does not realise that
>>    flows it manages via an ethdev "B" are in fact
>>    communicated to the NIC via an ethdev "A".
>>
>>    Unbeknownst of the implicit scheme, the application may
>>    detach the privileged ethdev "A" in-between. And, when
>>    time comes to remove flows, doing so via ethdev "B"
>>    will fail. This scheme breaks in-app housekeeping.
>>
>> 2. Expose the "proxy" port existence to the application.
>>    If it knows the truth about the real ethdev that
>>    handles the transfer flows, it won't attempt to
>>    detach it in-between. The housekeeping is fine.
>>
>> Outing the existence of the "proxy" port to users seems like the most
>> reasonable approach. This is why it was implemented in DPDK like this.
>> Currently, it's indeed an experimental feature. DPDK PMDs which need it, are
>> supposed to switch to it during the transition phase.
> Thanks very much for the explanation, though IMHO relevant PMDs could 
still hide it and not do this "outing" of their internals.


Sort of yes, they could hide it. But that would mean doing additional
record-keeping internally, in order to return EBUSY when the app asks
to detach the privileged port while it still has active flows on it
that were originally requested via an unprivileged port. That might be
quite error-prone. Also, given that quite a few vendors might need
this, isn't it better to make the feature generic?


>>
>> However, I should stress out that to NICs that support managing transfer
>> flows on any PFs and VFs, this proxy scheme is a don't care. The
>> corresponding drivers may not implement the proxy query method at all:
>>
>> https://github.com/DPDK/dpdk/blob/main/lib/ethdev/rte_flow.c#L1345
>>
>> The generic part of the API will just return the original port ID to the
>> application.
> Yes, I saw that. Thanks.


You're very welcome.


>>
>>
>>>>
>>>>>
>>>>>> -----Original Message-----
>>>>>> From: Ivan Malov <ivan.malov@oktetlabs.ru>
>>>>>> Sent: Monday, May 30, 2022 5:16 PM
>>>>>> To: dev@openvswitch.org
>>>>>> Cc: Andrew Rybchenko <andrew.rybchenko@oktetlabs.ru>; Ilya
>> Maximets
>>>>>> <i.maximets@ovn.org>; Ori Kam <orika@nvidia.com>; Eli Britstein
>>>>>> <elibr@nvidia.com>; NBU-Contact-Thomas Monjalon (EXTERNAL)
>>>>>> <thomas@monjalon.net>; Stephen Hemminger
>>>>>> <stephen@networkplumber.org>; David Marchand
>>>>>> <david.marchand@redhat.com>; Gaetan Rivet <grive@u256.net>;
>> Maxime
>>>>>> Coquelin <maxime.coquelin@redhat.com>
>>>>>> Subject: [PATCH 3/3] netdev-offload-dpdk: use flow transfer proxy
>>>>>> mechanism
>>>>>>
>>>>>> External email: Use caution opening links or attachments
>>>>>>
>>>>>>
>>>>>> Manage "transfer" flows via the corresponding mechanism.
>>>>>> Doing so requires that the traffic source be specified explicitly,
>>>>>> via the corresponding pattern item.
>>>>>>
>>>>>> Signed-off-by: Ivan Malov <ivan.malov@oktetlabs.ru>
>>>>>> Acked-by: Andrew Rybchenko <andrew.rybchenko@oktetlabs.ru>
>>>>>> ---
>>>>>> lib/netdev-dpdk.c         | 73 ++++++++++++++++++++++++++++++++-------
>>>>>> lib/netdev-dpdk.h         |  2 +-
>>>>>> lib/netdev-offload-dpdk.c | 43 ++++++++++++++++++++++-
>>>>>> 3 files changed, 103 insertions(+), 15 deletions(-)
>>>>>>
>>>>>> diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index
>>>>>> 45e5d26d2..d0bf4613a 100644
>>>>>> --- a/lib/netdev-dpdk.c
>>>>>> +++ b/lib/netdev-dpdk.c
>>>>>> @@ -420,6 +420,7 @@ enum dpdk_hw_ol_features {
>>>>>>
>>>>>> struct netdev_dpdk {
>>>>>>     PADDED_MEMBERS_CACHELINE_MARKER(CACHE_LINE_SIZE,
>>>> cacheline0,
>>>>>> +        dpdk_port_t flow_transfer_proxy_port_id;
>>>>>>         dpdk_port_t port_id;
>>>>>>
>>>>>>         /* If true, device was attached by rte_eth_dev_attach(). */
>>>>>> @@ -1115,6
>>>>>> +1116,23 @@ dpdk_eth_dev_init_rx_metadata(struct netdev_dpdk
>> *dev)
>>>>>>                   DPDK_PORT_ID_FMT, dev->port_id);
>>>>>>     }
>>>>>> }
>>>>>> +
>>>>>> +static void
>>>>>> +dpdk_eth_dev_init_flow_transfer_proxy(struct netdev_dpdk *dev) {
>>>>>> +    int ret;
>>>>>> +
>>>>>> +    ret = rte_flow_pick_transfer_proxy(dev->port_id,
>>>>>> +                                       &dev->flow_transfer_proxy_port_id, NULL);
>>>>>> +    if (ret == 0)
>>>>>> +        return;
>>>>>> +
>>>>>> +    /*
>>>>>> +     * The PMD does not indicate the proxy port.
>>>>>> +     * It is OK to assume the proxy is unneeded.
>>>>>> +     */
>>>>>> +    dev->flow_transfer_proxy_port_id = dev->port_id; }
>>>>>> #endif /* ALLOW_EXPERIMENTAL_API */
>>>>>>
>>>>>> static int
>>>>>> @@ -1141,6 +1159,19 @@ dpdk_eth_dev_init(struct netdev_dpdk *dev)
>>>>>>      * Request delivery of such metadata.
>>>>>>      */
>>>>>>     dpdk_eth_dev_init_rx_metadata(dev);
>>>>>> +
>>>>>> +    /*
>>>>>> +     * Managing "transfer" flows requires that the user communicate
>> them
>>>>>> +     * via a port which has the privilege to control the embedded switch.
>>>>>> +     * For some vendors, all ports in a given switching domain have
>>>>>> +     * this privilege. For other vendors, it's only one port.
>>>>>> +     *
>>>>>> +     * Get the proxy port ID and remember it for later use.
>>>>>> +     */
>>>>>> +    dpdk_eth_dev_init_flow_transfer_proxy(dev);
>>>>>> +#else /* ! ALLOW_EXPERIMENTAL_API */
>>>>>> +    /* It is OK to assume the proxy is unneeded. */
>>>>>> +    dev->flow_transfer_proxy_port_id = dev->port_id;
>>>>>> #endif /* ALLOW_EXPERIMENTAL_API */
>>>>>>
>>>>>>     rte_eth_dev_info_get(dev->port_id, &info); @@ -5214,13 +5245,15
>>>>>> @@ out:
>>>>>>
>>>>>> int
>>>>>> netdev_dpdk_rte_flow_destroy(struct netdev *netdev,
>>>>>> -                             struct rte_flow *rte_flow,
>>>>>> +                             bool transfer, struct rte_flow
>>>>>> + *rte_flow,
>>>>>>                              struct rte_flow_error *error) {
>>>>>>     struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
>>>>>>     int ret;
>>>>>>
>>>>>> -    ret = rte_flow_destroy(dev->port_id, rte_flow, error);
>>>>>> +    ret = rte_flow_destroy(transfer ?
>>>>>> +                           dev->flow_transfer_proxy_port_id : dev->port_id,
>>>>>> +                           rte_flow, error);
>>>>>>     return ret;
>>>>>> }
>>>>>>
>>>>>> @@ -5234,7 +5267,19 @@ netdev_dpdk_rte_flow_create(struct netdev
>>>>>> *netdev,
>>>>>>     struct rte_flow *flow;
>>>>>>     struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
>>>>>>
>>>>>> -    flow = rte_flow_create(dev->port_id, attr, items, actions, error);
>>>>>> +#ifdef ALLOW_EXPERIMENTAL_API
>>>>>> +    if (!attr->transfer) {
>>>>>> +        /*
>>>>>> +         * The 1st item in any pattern is a traffic source one.
>>>>>> +         * It is unnecessary in the case of non-transfer rules.
>>>>>> +         */
>>>>>> +        ++(items);
>>>>>> +    }
>>>>>> +#endif /* ALLOW_EXPERIMENTAL_API */
>>>>>> +
>>>>>> +    flow = rte_flow_create(attr->transfer ?
>>>>>> +                           dev->flow_transfer_proxy_port_id : dev->port_id,
>>>>>> +                           attr, items, actions, error);
>>>>>>     return flow;
>>>>>> }
>>>>>>
>>>>>> @@ -5262,7 +5307,8 @@ netdev_dpdk_rte_flow_query_count(struct
>>>> netdev
>>>>>> *netdev,
>>>>>>     }
>>>>>>
>>>>>>     dev = netdev_dpdk_cast(netdev);
>>>>>> -    ret = rte_flow_query(dev->port_id, rte_flow, actions, query, error);
>>>>>> +    ret = rte_flow_query(dev->flow_transfer_proxy_port_id, rte_flow,
>>>>>> +                         actions, query, error);
>>>>>>     return ret;
>>>>>> }
>>>>>>
>>>>>> @@ -5284,8 +5330,8 @@
>> netdev_dpdk_rte_flow_tunnel_decap_set(struct
>>>>>> netdev *netdev,
>>>>>>
>>>>>>     dev = netdev_dpdk_cast(netdev);
>>>>>>     ovs_mutex_lock(&dev->mutex);
>>>>>> -    ret = rte_flow_tunnel_decap_set(dev->port_id, tunnel, actions,
>>>>>> -                                    num_of_actions, error);
>>>>>> +    ret =
>>>>>> + rte_flow_tunnel_decap_set(dev->flow_transfer_proxy_port_id,
>>>>>> tunnel,
>>>>>> +                                    actions, num_of_actions,
>>>>>> + error);
>>>>>>     ovs_mutex_unlock(&dev->mutex);
>>>>>>     return ret;
>>>>>> }
>>>>>> @@ -5306,8 +5352,8 @@ netdev_dpdk_rte_flow_tunnel_match(struct
>>>>>> netdev *netdev,
>>>>>>
>>>>>>     dev = netdev_dpdk_cast(netdev);
>>>>>>     ovs_mutex_lock(&dev->mutex);
>>>>>> -    ret = rte_flow_tunnel_match(dev->port_id, tunnel, items,
>>>> num_of_items,
>>>>>> -                                error);
>>>>>> +    ret = rte_flow_tunnel_match(dev->flow_transfer_proxy_port_id,
>>>> tunnel,
>>>>>> +                                items, num_of_items, error);
>>>>>>     ovs_mutex_unlock(&dev->mutex);
>>>>>>     return ret;
>>>>>> }
>>>>>> @@ -5328,7 +5374,8 @@
>> netdev_dpdk_rte_flow_get_restore_info(struct
>>>>>> netdev *netdev,
>>>>>>
>>>>>>     dev = netdev_dpdk_cast(netdev);
>>>>>>     ovs_mutex_lock(&dev->mutex);
>>>>>> -    ret = rte_flow_get_restore_info(dev->port_id, m, info, error);
>>>>>> +    ret = rte_flow_get_restore_info(dev->flow_transfer_proxy_port_id,
>>>>>> +                                    m, info, error);
>>>>>>     ovs_mutex_unlock(&dev->mutex);
>>>>>>     return ret;
>>>>>> }
>>>>>> @@ -5349,8 +5396,8 @@
>>>>>> netdev_dpdk_rte_flow_tunnel_action_decap_release(
>>>>>>
>>>>>>     dev = netdev_dpdk_cast(netdev);
>>>>>>     ovs_mutex_lock(&dev->mutex);
>>>>>> -    ret = rte_flow_tunnel_action_decap_release(dev->port_id, actions,
>>>>>> -                                               num_of_actions, error);
>>>>>> +    ret = rte_flow_tunnel_action_decap_release(dev-
>>>>>>> flow_transfer_proxy_port_id,
>>>>>> +                                               actions,
>>>>>> + num_of_actions, error);
>>>>>>     ovs_mutex_unlock(&dev->mutex);
>>>>>>     return ret;
>>>>>> }
>>>>>> @@ -5370,8 +5417,8 @@
>>>> netdev_dpdk_rte_flow_tunnel_item_release(struct
>>>>>> netdev *netdev,
>>>>>>
>>>>>>     dev = netdev_dpdk_cast(netdev);
>>>>>>     ovs_mutex_lock(&dev->mutex);
>>>>>> -    ret = rte_flow_tunnel_item_release(dev->port_id, items,
>>>> num_of_items,
>>>>>> -                                       error);
>>>>>> +    ret = rte_flow_tunnel_item_release(dev-
>>> flow_transfer_proxy_port_id,
>>>>>> +                                       items, num_of_items,
>>>>>> + error);
>>>>>>     ovs_mutex_unlock(&dev->mutex);
>>>>>>     return ret;
>>>>>> }
>>>>>> diff --git a/lib/netdev-dpdk.h b/lib/netdev-dpdk.h index
>>>>>> 699be3fb4..1dd5953a4 100644
>>>>>> --- a/lib/netdev-dpdk.h
>>>>>> +++ b/lib/netdev-dpdk.h
>>>>>> @@ -35,7 +35,7 @@ bool netdev_dpdk_flow_api_supported(struct
>> netdev
>>>>>> *);
>>>>>>
>>>>>> int
>>>>>> netdev_dpdk_rte_flow_destroy(struct netdev *netdev,
>>>>>> -                             struct rte_flow *rte_flow,
>>>>>> +                             bool transfer, struct rte_flow
>>>>>> + *rte_flow,
>>>>>>                              struct rte_flow_error *error); struct
>>>>>> rte_flow * netdev_dpdk_rte_flow_create(struct netdev *netdev, diff
>>>>>> --git a/lib/netdev-offload-dpdk.c b/lib/netdev-offload-dpdk.c index
>>>>>> 9cd5a0159..f8b90cbf7 100644
>>>>>> --- a/lib/netdev-offload-dpdk.c
>>>>>> +++ b/lib/netdev-offload-dpdk.c
>>>>>> @@ -353,8 +353,23 @@ dump_flow_pattern(struct ds *s,
>>>>>>
>>>>>>     if (item->type == RTE_FLOW_ITEM_TYPE_END) {
>>>>>>         ds_put_cstr(s, "end ");
>>>>>> +#ifdef ALLOW_EXPERIMENTAL_API
>>>>>> +    } else if (item->type ==
>> RTE_FLOW_ITEM_TYPE_REPRESENTED_PORT) {
>>>>>> +        const struct rte_flow_item_ethdev *ethdev_spec = item->spec;
>>>>>> +        const struct rte_flow_item_ethdev *ethdev_mask =
>>>>>> +item->mask;
>>>>>> +
>>>>>> +        ds_put_cstr(s, "represented_port ");
>>>>>> +
>>>>>> +        DUMP_PATTERN_ITEM(ethdev_mask->port_id, false,
>>>> "ethdev_port_id",
>>>>>> +                          "%"PRIu16, ethdev_spec->port_id,
>>>>>> +                          ethdev_mask->port_id, 0);
>>>>>> +    } else if (flow_patterns->tnl_pmd_items_cnt &&
>>>>>> +               pattern_index < 1 /* REPRESENTED_PORT */ +
>>>>>> +                               flow_patterns->tnl_pmd_items_cnt) {
>>>>>> +#else /* ! ALLOW_EXPERIMENTAL_API */
>>>>>>     } else if (flow_patterns->tnl_pmd_items_cnt &&
>>>>>>                pattern_index < flow_patterns->tnl_pmd_items_cnt) {
>>>>>> +#endif /* ALLOW_EXPERIMENTAL_API */
>>>>>>         return;
>>>>>>     } else if (item->type == RTE_FLOW_ITEM_TYPE_ETH) {
>>>>>>         const struct rte_flow_item_eth *eth_spec = item->spec; @@
>>>>>> -1035,6 +1050,12 @@ free_flow_patterns(struct flow_patterns
>>>>>> *patterns)
>>>>>>     struct rte_flow_error error;
>>>>>>     int i;
>>>>>>
>>>>>> +#ifdef ALLOW_EXPERIMENTAL_API
>>>>>> +    /* REPRESENTED_PORT */
>>>>>> +    free(CONST_CAST(void *, patterns->items[0].spec));
>>>>>> +    free(CONST_CAST(void *, patterns->items[0].mask)); #endif /*
>>>>>> +ALLOW_EXPERIMENTAL_API */
>>>>>> +
>>>>>>     if (patterns->tnl_pmd_items) {
>>>>>>         struct rte_flow_item *tnl_pmd_items = patterns->tnl_pmd_items;
>>>>>>         uint32_t tnl_pmd_items_cnt = patterns->tnl_pmd_items_cnt;
>>>>>> @@
>>>>>> -1049,7 +1070,12 @@ free_flow_patterns(struct flow_patterns
>>>>>> *patterns)
>>>>>>         }
>>>>>>     }
>>>>>>
>>>>>> +#ifdef ALLOW_EXPERIMENTAL_API
>>>>>> +    for (i = 1 /* REPRESENTED_PORT */ + patterns->tnl_pmd_items_cnt;
>>>>>> +         i < patterns->cnt; i++) { #else /* !
>>>>>> +ALLOW_EXPERIMENTAL_API */
>>>>>>     for (i = patterns->tnl_pmd_items_cnt; i < patterns->cnt; i++) {
>>>>>> +#endif /* ALLOW_EXPERIMENTAL_API */
>>>>>>         if (patterns->items[i].spec) {
>>>>>>             free(CONST_CAST(void *, patterns->items[i].spec));
>>>>>>         }
>>>>>> @@ -1383,10 +1409,23 @@ parse_flow_match(struct netdev *netdev,
>>>>>>                  struct flow_patterns *patterns,
>>>>>>                  struct match *match) {
>>>>>> +#ifdef ALLOW_EXPERIMENTAL_API
>>>>>> +    struct netdev *physdev = netdev_ports_get(orig_in_port,
>>>>>> +netdev-
>>>>>>> dpif_type);
>>>>>> +    struct rte_flow_item_ethdev *ethdev_spec = xzalloc(sizeof
>>>> *ethdev_spec);
>>>>>> +    struct rte_flow_item_ethdev *ethdev_mask = xzalloc(sizeof
>>>>>> *ethdev_mask);
>>>>>> +#endif /* ALLOW_EXPERIMENTAL_API */
>>>>>>     struct rte_flow_item_eth *eth_spec = NULL, *eth_mask = NULL;
>>>>>>     struct flow *consumed_masks;
>>>>>>     uint8_t proto = 0;
>>>>>>
>>>>>> +#ifdef ALLOW_EXPERIMENTAL_API
>>>>>> +    /* Add an explicit traffic source item to the beginning of the pattern.
>> */
>>>>>> +    ethdev_spec->port_id = netdev_dpdk_get_port_id(physdev);
>>>>>> +    *ethdev_mask = rte_flow_item_ethdev_mask;
>>>>>> +    add_flow_pattern(patterns,
>>>>>> RTE_FLOW_ITEM_TYPE_REPRESENTED_PORT,
>>>>>> +                     ethdev_spec, ethdev_mask, NULL); #endif /*
>>>>>> +ALLOW_EXPERIMENTAL_API */
>>>>>> +
>>>>>>     consumed_masks = &match->wc.masks;
>>>>>>
>>>>>>     if (!flow_tnl_dst_is_set(&match->flow.tunnel)) { @@ -2333,6
>>>>>> +2372,7 @@ netdev_offload_dpdk_flow_destroy(struct
>>>>>> ufid_to_rte_flow_data *rte_flow_data)
>>>>>>     struct netdev *physdev;
>>>>>>     struct netdev *netdev;
>>>>>>     ovs_u128 *ufid;
>>>>>> +    bool transfer;
>>>>>>     int ret;
>>>>>>
>>>>>>     ovs_mutex_lock(&rte_flow_data->lock);
>>>>>> @@ -2344,12 +2384,13 @@ netdev_offload_dpdk_flow_destroy(struct
>>>>>> ufid_to_rte_flow_data *rte_flow_data)
>>>>>>
>>>>>>     rte_flow_data->dead = true;
>>>>>>
>>>>>> +    transfer = rte_flow_data->actions_offloaded;
>>>>>>     rte_flow = rte_flow_data->rte_flow;
>>>>>>     physdev = rte_flow_data->physdev;
>>>>>>     netdev = rte_flow_data->netdev;
>>>>>>     ufid = &rte_flow_data->ufid;
>>>>>>
>>>>>> -    ret = netdev_dpdk_rte_flow_destroy(physdev, rte_flow, &error);
>>>>>> +    ret = netdev_dpdk_rte_flow_destroy(physdev, transfer,
>>>>>> + rte_flow, &error);
>>>>>>
>>>>>>     if (ret == 0) {
>>>>>>         struct netdev_offload_dpdk_data *data;
>>>>>> --
>>>>>> 2.30.2
>>>>>
>>>>>
>>>
>
Eli Britstein June 8, 2022, 7:43 p.m. UTC | #8
Hi Ivan,

>-----Original Message-----
>From: Ivan Malov <ivan.malov@oktetlabs.ru>
>Sent: Wednesday, June 8, 2022 10:02 PM
>To: Eli Britstein <elibr@nvidia.com>
>Cc: dev@openvswitch.org; Andrew Rybchenko
><andrew.rybchenko@oktetlabs.ru>; Ilya Maximets <i.maximets@ovn.org>;
>Ori Kam <orika@nvidia.com>; NBU-Contact-Thomas Monjalon (EXTERNAL)
><thomas@monjalon.net>; Stephen Hemminger
><stephen@networkplumber.org>; David Marchand
><david.marchand@redhat.com>; Gaetan Rivet <grive@u256.net>; Maxime
>Coquelin <maxime.coquelin@redhat.com>
>Subject: RE: [PATCH 3/3] netdev-offload-dpdk: use flow transfer proxy
>mechanism
>
>External email: Use caution opening links or attachments
>
>
>Hi Eli,
>
>On Wed, 8 Jun 2022, Eli Britstein wrote:
>
>> Hi Ivan,
>>
>>> -----Original Message-----
>>> From: Ivan Malov <ivan.malov@oktetlabs.ru>
>>> Sent: Wednesday, June 8, 2022 5:46 PM
>>> To: Eli Britstein <elibr@nvidia.com>
>>> Cc: dev@openvswitch.org; Andrew Rybchenko
>>> <andrew.rybchenko@oktetlabs.ru>; Ilya Maximets <i.maximets@ovn.org>;
>>> Ori Kam <orika@nvidia.com>; NBU-Contact-Thomas Monjalon (EXTERNAL)
>>> <thomas@monjalon.net>; Stephen Hemminger
>>> <stephen@networkplumber.org>; David Marchand
>>> <david.marchand@redhat.com>; Gaetan Rivet <grive@u256.net>; Maxime
>>> Coquelin <maxime.coquelin@redhat.com>
>>> Subject: RE: [PATCH 3/3] netdev-offload-dpdk: use flow transfer proxy
>>> mechanism
>>>
>>> External email: Use caution opening links or attachments
>>>
>>>
>>> Hi Eli,
>>>
>>> On Wed, 8 Jun 2022, Eli Britstein wrote:
>>>
>>>> Hi Ivan,
>>>>
>>>>> -----Original Message-----
>>>>> From: Ivan Malov <ivan.malov@oktetlabs.ru>
>>>>> Sent: Tuesday, June 7, 2022 11:56 PM
>>>>> To: Eli Britstein <elibr@nvidia.com>
>>>>> Cc: dev@openvswitch.org; Andrew Rybchenko
>>>>> <andrew.rybchenko@oktetlabs.ru>; Ilya Maximets
>>>>> <i.maximets@ovn.org>; Ori Kam <orika@nvidia.com>;
>>>>> NBU-Contact-Thomas Monjalon (EXTERNAL) <thomas@monjalon.net>;
>>>>> Stephen Hemminger <stephen@networkplumber.org>; David Marchand
>>>>> <david.marchand@redhat.com>; Gaetan Rivet <grive@u256.net>;
>Maxime
>>>>> Coquelin <maxime.coquelin@redhat.com>
>>>>> Subject: RE: [PATCH 3/3] netdev-offload-dpdk: use flow transfer
>>>>> proxy mechanism
>>>>>
>>>>> External email: Use caution opening links or attachments
>>>>>
>>>>>
>>>>> Hi Eli,
>>>>>
>>>>> On Wed, 1 Jun 2022, Eli Britstein wrote:
>>>>>
>>>>>> - Missing proper handling of the testpmd syntax logging. It
>>>>>> changes the used
>>>>> port according to "transfer", but the log still uses
>>> netdev_dpdk_get_port_id().
>>>>>
>>>>> Thanks for noticing. I will see to it in the next version.
>>>>>
>>>>>> - The usage of the "proxy" port for rte-flow implies that this
>>>>>> proxy port is
>>>>> attached to OVS, otherwise it is not "started" and creation of
>>>>> flows will
>>> fail.
>>>>>
>>>>> That's the way it is. If there is no proxy for a given port, then
>>>>> the original port value will be used for managing flows. For
>>>>> vendors that don't need the proxy, this will work. For others, it won't.
>That's OK.
>>>
>>>> I don't really understand why this can't be done inside dpdk domain
>>>> (if there
>>> is a proxy, and it is up, use it, otherwise don't).
>>>> That's *currently* the way it is. I understand that if dpdk works
>>>> like this OVS
>>> should align, but maybe you or someone else here knows why dpdk works
>>> like this? (not too late to change, this is experimental...).
>>>
>>>
>>> Regardless of DPDK, on some NICs, it is possible to insert rules via
>>> unprivileged PFs or VFs, but there are also NICs which cannot do it.
>>>
>>> In DPDK, this contradiction has to be resolved somehow.
>>> In example, for NICs that can only manage flows via privileged ports,
>>> two possible solutions exist:
>>>
>>> 1. Route flow management requests from unprivileged ethdevs
>>>    to the privileged one implicitly, inside the PMD. This
>>>    is transparent to users, but, at the same time, it is
>>>    tricky because the application does not realise that
>>>    flows it manages via an ethdev "B" are in fact
>>>    communicated to the NIC via an ethdev "A".
>>>
>>>    Unbeknownst of the implicit scheme, the application may
>>>    detach the privileged ethdev "A" in-between. And, when
>>>    time comes to remove flows, doing so via ethdev "B"
>>>    will fail. This scheme breaks in-app housekeeping.
>>>
>>> 2. Expose the "proxy" port existence to the application.
>>>    If it knows the truth about the real ethdev that
>>>    handles the transfer flows, it won't attempt to
>>>    detach it in-between. The housekeeping is fine.
>>>
>>> Outing the existence of the "proxy" port to users seems like the most
>>> reasonable approach. This is why it was implemented in DPDK like this.
>>> Currently, it's indeed an experimental feature. DPDK PMDs which need
>>> it, are supposed to switch to it during the transition phase.
>> Thanks very much for the explanation, though IMHO relevant PMDs could
>still hide it and not do this "outing" of their internals.
>
>
>Sort of yes, they could hide it. But that would mean doing additional record-
>keeping internally, in order to return EBUSY when the app asks to detach the
>privileged port which still has active flows on it that have been originally
>requested via an unprivileged port. Might be quite error prone. Also, given the
>fact that quite a few vendors might need this, isn't it better to make the
>feature generic?
Discussing such a scenario, this patch does not handle it. Suppose A is the privileged port, serving as a proxy for port B.
Now, there are flows applied on port B (but actually on A). Nothing prevents OVS from detaching port A. The flows applied on port B remain ghosted in OVS. When they are later removed, the behavior is undefined and may even crash.
Am I wrong?
>
>
>>>
>>> However, I should stress out that to NICs that support managing
>>> transfer flows on any PFs and VFs, this proxy scheme is a don't care.
>>> The corresponding drivers may not implement the proxy query method at
>all:
>>>
>>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fgit
>>> hu
>>>
>b.com%2FDPDK%2Fdpdk%2Fblob%2Fmain%2Flib%2Fethdev%2Frte_flow.c%2
>>>
>3L1345&amp;data=05%7C01%7Celibr%40nvidia.com%7Cf5a80eb00f0342498
>>>
>63308da495dab8b%7C43083d15727340c1b7db39efd9ccc17a%7C0%7C0%7C6
>>>
>37902963929533013%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMD
>>>
>AiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000%7C%7C%7C
>>>
>&amp;sdata=ojwUOsPlz09NXtDXfeO8lAT%2BHcgGYWNRdIhxB6f0cy0%3D&a
>>> mp;reserved=0
>>>
>>> The generic part of the API will just return the original port ID to
>>> the application.
>> Yes, I saw that. Thanks.
>
>
>You're very welcome.
>
>
>>>
>>>
>>>>>
>>>>>>
>>>>>>> -----Original Message-----
>>>>>>> From: Ivan Malov <ivan.malov@oktetlabs.ru>
>>>>>>> Sent: Monday, May 30, 2022 5:16 PM
>>>>>>> To: dev@openvswitch.org
>>>>>>> Cc: Andrew Rybchenko <andrew.rybchenko@oktetlabs.ru>; Ilya
>>> Maximets
>>>>>>> <i.maximets@ovn.org>; Ori Kam <orika@nvidia.com>; Eli Britstein
>>>>>>> <elibr@nvidia.com>; NBU-Contact-Thomas Monjalon (EXTERNAL)
>>>>>>> <thomas@monjalon.net>; Stephen Hemminger
>>>>>>> <stephen@networkplumber.org>; David Marchand
>>>>>>> <david.marchand@redhat.com>; Gaetan Rivet <grive@u256.net>;
>>> Maxime
>>>>>>> Coquelin <maxime.coquelin@redhat.com>
>>>>>>> Subject: [PATCH 3/3] netdev-offload-dpdk: use flow transfer proxy
>>>>>>> mechanism
>>>>>>>
>>>>>>> External email: Use caution opening links or attachments
>>>>>>>
>>>>>>>
>>>>>>> Manage "transfer" flows via the corresponding mechanism.
>>>>>>> Doing so requires that the traffic source be specified
>>>>>>> explicitly, via the corresponding pattern item.
>>>>>>>
>>>>>>> Signed-off-by: Ivan Malov <ivan.malov@oktetlabs.ru>
>>>>>>> Acked-by: Andrew Rybchenko <andrew.rybchenko@oktetlabs.ru>
>>>>>>> ---
>>>>>>> lib/netdev-dpdk.c         | 73 ++++++++++++++++++++++++++++++++----
>---
>>>>>>> lib/netdev-dpdk.h         |  2 +-
>>>>>>> lib/netdev-offload-dpdk.c | 43 ++++++++++++++++++++++-
>>>>>>> 3 files changed, 103 insertions(+), 15 deletions(-)
>>>>>>>
>>>>>>> diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index
>>>>>>> 45e5d26d2..d0bf4613a 100644
>>>>>>> --- a/lib/netdev-dpdk.c
>>>>>>> +++ b/lib/netdev-dpdk.c
>>>>>>> @@ -420,6 +420,7 @@ enum dpdk_hw_ol_features {
>>>>>>>
>>>>>>> struct netdev_dpdk {
>>>>>>>     PADDED_MEMBERS_CACHELINE_MARKER(CACHE_LINE_SIZE,
>>>>> cacheline0,
>>>>>>> +        dpdk_port_t flow_transfer_proxy_port_id;
>>>>>>>         dpdk_port_t port_id;
>>>>>>>
>>>>>>>         /* If true, device was attached by rte_eth_dev_attach().
>>>>>>> */ @@ -1115,6
>>>>>>> +1116,23 @@ dpdk_eth_dev_init_rx_metadata(struct netdev_dpdk
>>> *dev)
>>>>>>>                   DPDK_PORT_ID_FMT, dev->port_id);
>>>>>>>     }
>>>>>>> }
>>>>>>> +
>>>>>>> +static void
>>>>>>> +dpdk_eth_dev_init_flow_transfer_proxy(struct netdev_dpdk *dev) {
>>>>>>> +    int ret;
>>>>>>> +
>>>>>>> +    ret = rte_flow_pick_transfer_proxy(dev->port_id,
>>>>>>> +                                       &dev->flow_transfer_proxy_port_id, NULL);
>>>>>>> +    if (ret == 0)
>>>>>>> +        return;
>>>>>>> +
>>>>>>> +    /*
>>>>>>> +     * The PMD does not indicate the proxy port.
>>>>>>> +     * It is OK to assume the proxy is unneeded.
>>>>>>> +     */
>>>>>>> +    dev->flow_transfer_proxy_port_id = dev->port_id; }
>>>>>>> #endif /* ALLOW_EXPERIMENTAL_API */
>>>>>>>
>>>>>>> static int
>>>>>>> @@ -1141,6 +1159,19 @@ dpdk_eth_dev_init(struct netdev_dpdk
>*dev)
>>>>>>>      * Request delivery of such metadata.
>>>>>>>      */
>>>>>>>     dpdk_eth_dev_init_rx_metadata(dev);
>>>>>>> +
>>>>>>> +    /*
>>>>>>> +     * Managing "transfer" flows requires that the user
>>>>>>> + communicate
>>> them
>>>>>>> +     * via a port which has the privilege to control the embedded
>switch.
>>>>>>> +     * For some vendors, all ports in a given switching domain have
>>>>>>> +     * this privilege. For other vendors, it's only one port.
>>>>>>> +     *
>>>>>>> +     * Get the proxy port ID and remember it for later use.
>>>>>>> +     */
>>>>>>> +    dpdk_eth_dev_init_flow_transfer_proxy(dev);
>>>>>>> +#else /* ! ALLOW_EXPERIMENTAL_API */
>>>>>>> +    /* It is OK to assume the proxy is unneeded. */
>>>>>>> +    dev->flow_transfer_proxy_port_id = dev->port_id;
>>>>>>> #endif /* ALLOW_EXPERIMENTAL_API */
>>>>>>>
>>>>>>>     rte_eth_dev_info_get(dev->port_id, &info); @@ -5214,13
>>>>>>> +5245,15 @@ out:
>>>>>>>
>>>>>>> int
>>>>>>> netdev_dpdk_rte_flow_destroy(struct netdev *netdev,
>>>>>>> -                             struct rte_flow *rte_flow,
>>>>>>> +                             bool transfer, struct rte_flow
>>>>>>> + *rte_flow,
>>>>>>>                              struct rte_flow_error *error) {
>>>>>>>     struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
>>>>>>>     int ret;
>>>>>>>
>>>>>>> -    ret = rte_flow_destroy(dev->port_id, rte_flow, error);
>>>>>>> +    ret = rte_flow_destroy(transfer ?
>>>>>>> +                           dev->flow_transfer_proxy_port_id : dev->port_id,
>>>>>>> +                           rte_flow, error);
>>>>>>>     return ret;
>>>>>>> }
>>>>>>>
>>>>>>> @@ -5234,7 +5267,19 @@ netdev_dpdk_rte_flow_create(struct
>netdev
>>>>>>> *netdev,
>>>>>>>     struct rte_flow *flow;
>>>>>>>     struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
>>>>>>>
>>>>>>> -    flow = rte_flow_create(dev->port_id, attr, items, actions, error);
>>>>>>> +#ifdef ALLOW_EXPERIMENTAL_API
>>>>>>> +    if (!attr->transfer) {
>>>>>>> +        /*
>>>>>>> +         * The 1st item in any pattern is a traffic source one.
>>>>>>> +         * It is unnecessary in the case of non-transfer rules.
>>>>>>> +         */
>>>>>>> +        ++(items);
>>>>>>> +    }
>>>>>>> +#endif /* ALLOW_EXPERIMENTAL_API */
>>>>>>> +
>>>>>>> +    flow = rte_flow_create(attr->transfer ?
>>>>>>> +                           dev->flow_transfer_proxy_port_id : dev->port_id,
>>>>>>> +                           attr, items, actions, error);
>>>>>>>     return flow;
>>>>>>> }
>>>>>>>
>>>>>>> @@ -5262,7 +5307,8 @@ netdev_dpdk_rte_flow_query_count(struct
>>>>> netdev
>>>>>>> *netdev,
>>>>>>>     }
>>>>>>>
>>>>>>>     dev = netdev_dpdk_cast(netdev);
>>>>>>> -    ret = rte_flow_query(dev->port_id, rte_flow, actions, query, error);
>>>>>>> +    ret = rte_flow_query(dev->flow_transfer_proxy_port_id, rte_flow,
>>>>>>> +                         actions, query, error);
>>>>>>>     return ret;
>>>>>>> }
>>>>>>>
>>>>>>> @@ -5284,8 +5330,8 @@
>>> netdev_dpdk_rte_flow_tunnel_decap_set(struct
>>>>>>> netdev *netdev,
>>>>>>>
>>>>>>>     dev = netdev_dpdk_cast(netdev);
>>>>>>>     ovs_mutex_lock(&dev->mutex);
>>>>>>> -    ret = rte_flow_tunnel_decap_set(dev->port_id, tunnel, actions,
>>>>>>> -                                    num_of_actions, error);
>>>>>>> +    ret =
>>>>>>> + rte_flow_tunnel_decap_set(dev->flow_transfer_proxy_port_id,
>>>>>>> tunnel,
>>>>>>> +                                    actions, num_of_actions,
>>>>>>> + error);
>>>>>>>     ovs_mutex_unlock(&dev->mutex);
>>>>>>>     return ret;
>>>>>>> }
>>>>>>> @@ -5306,8 +5352,8 @@
>netdev_dpdk_rte_flow_tunnel_match(struct
>>>>>>> netdev *netdev,
>>>>>>>
>>>>>>>     dev = netdev_dpdk_cast(netdev);
>>>>>>>     ovs_mutex_lock(&dev->mutex);
>>>>>>> -    ret = rte_flow_tunnel_match(dev->port_id, tunnel, items,
>>>>> num_of_items,
>>>>>>> -                                error);
>>>>>>> +    ret =
>>>>>>> + rte_flow_tunnel_match(dev->flow_transfer_proxy_port_id,
>>>>> tunnel,
>>>>>>> +                                items, num_of_items, error);
>>>>>>>     ovs_mutex_unlock(&dev->mutex);
>>>>>>>     return ret;
>>>>>>> }
>>>>>>> @@ -5328,7 +5374,8 @@
>>> netdev_dpdk_rte_flow_get_restore_info(struct
>>>>>>> netdev *netdev,
>>>>>>>
>>>>>>>     dev = netdev_dpdk_cast(netdev);
>>>>>>>     ovs_mutex_lock(&dev->mutex);
>>>>>>> -    ret = rte_flow_get_restore_info(dev->port_id, m, info, error);
>>>>>>> +    ret = rte_flow_get_restore_info(dev-
>>flow_transfer_proxy_port_id,
>>>>>>> +                                    m, info, error);
>>>>>>>     ovs_mutex_unlock(&dev->mutex);
>>>>>>>     return ret;
>>>>>>> }
>>>>>>> @@ -5349,8 +5396,8 @@
>>>>>>> netdev_dpdk_rte_flow_tunnel_action_decap_release(
>>>>>>>
>>>>>>>     dev = netdev_dpdk_cast(netdev);
>>>>>>>     ovs_mutex_lock(&dev->mutex);
>>>>>>> -    ret = rte_flow_tunnel_action_decap_release(dev->port_id,
>actions,
>>>>>>> -                                               num_of_actions, error);
>>>>>>> +    ret = rte_flow_tunnel_action_decap_release(dev-
>>>>>>>> flow_transfer_proxy_port_id,
>>>>>>> +                                               actions,
>>>>>>> + num_of_actions, error);
>>>>>>>     ovs_mutex_unlock(&dev->mutex);
>>>>>>>     return ret;
>>>>>>> }
>>>>>>> @@ -5370,8 +5417,8 @@
>>>>> netdev_dpdk_rte_flow_tunnel_item_release(struct
>>>>>>> netdev *netdev,
>>>>>>>
>>>>>>>     dev = netdev_dpdk_cast(netdev);
>>>>>>>     ovs_mutex_lock(&dev->mutex);
>>>>>>> -    ret = rte_flow_tunnel_item_release(dev->port_id, items,
>>>>> num_of_items,
>>>>>>> -                                       error);
>>>>>>> +    ret = rte_flow_tunnel_item_release(dev-
>>>> flow_transfer_proxy_port_id,
>>>>>>> +                                       items, num_of_items,
>>>>>>> + error);
>>>>>>>     ovs_mutex_unlock(&dev->mutex);
>>>>>>>     return ret;
>>>>>>> }
>>>>>>> diff --git a/lib/netdev-dpdk.h b/lib/netdev-dpdk.h index
>>>>>>> 699be3fb4..1dd5953a4 100644
>>>>>>> --- a/lib/netdev-dpdk.h
>>>>>>> +++ b/lib/netdev-dpdk.h
>>>>>>> @@ -35,7 +35,7 @@ bool netdev_dpdk_flow_api_supported(struct
>>> netdev
>>>>>>> *);
>>>>>>>
>>>>>>> int
>>>>>>> netdev_dpdk_rte_flow_destroy(struct netdev *netdev,
>>>>>>> -                             struct rte_flow *rte_flow,
>>>>>>> +                             bool transfer, struct rte_flow
>>>>>>> + *rte_flow,
>>>>>>>                              struct rte_flow_error *error);
>>>>>>> struct rte_flow * netdev_dpdk_rte_flow_create(struct netdev
>>>>>>> *netdev, diff --git a/lib/netdev-offload-dpdk.c
>>>>>>> b/lib/netdev-offload-dpdk.c index
>>>>>>> 9cd5a0159..f8b90cbf7 100644
>>>>>>> --- a/lib/netdev-offload-dpdk.c
>>>>>>> +++ b/lib/netdev-offload-dpdk.c
>>>>>>> @@ -353,8 +353,23 @@ dump_flow_pattern(struct ds *s,
>>>>>>>
>>>>>>>     if (item->type == RTE_FLOW_ITEM_TYPE_END) {
>>>>>>>         ds_put_cstr(s, "end ");
>>>>>>> +#ifdef ALLOW_EXPERIMENTAL_API
>>>>>>> +    } else if (item->type ==
>>> RTE_FLOW_ITEM_TYPE_REPRESENTED_PORT) {
>>>>>>> +        const struct rte_flow_item_ethdev *ethdev_spec = item->spec;
>>>>>>> +        const struct rte_flow_item_ethdev *ethdev_mask =
>>>>>>> +item->mask;
>>>>>>> +
>>>>>>> +        ds_put_cstr(s, "represented_port ");
>>>>>>> +
>>>>>>> +        DUMP_PATTERN_ITEM(ethdev_mask->port_id, false,
>>>>> "ethdev_port_id",
>>>>>>> +                          "%"PRIu16, ethdev_spec->port_id,
>>>>>>> +                          ethdev_mask->port_id, 0);
>>>>>>> +    } else if (flow_patterns->tnl_pmd_items_cnt &&
>>>>>>> +               pattern_index < 1 /* REPRESENTED_PORT */ +
>>>>>>> +                               flow_patterns->tnl_pmd_items_cnt)
>>>>>>> +{ #else /* ! ALLOW_EXPERIMENTAL_API */
>>>>>>>     } else if (flow_patterns->tnl_pmd_items_cnt &&
>>>>>>>                pattern_index < flow_patterns->tnl_pmd_items_cnt)
>>>>>>> {
>>>>>>> +#endif /* ALLOW_EXPERIMENTAL_API */
>>>>>>>         return;
>>>>>>>     } else if (item->type == RTE_FLOW_ITEM_TYPE_ETH) {
>>>>>>>         const struct rte_flow_item_eth *eth_spec = item->spec; @@
>>>>>>> -1035,6 +1050,12 @@ free_flow_patterns(struct flow_patterns
>>>>>>> *patterns)
>>>>>>>     struct rte_flow_error error;
>>>>>>>     int i;
>>>>>>>
>>>>>>> +#ifdef ALLOW_EXPERIMENTAL_API
>>>>>>> +    /* REPRESENTED_PORT */
>>>>>>> +    free(CONST_CAST(void *, patterns->items[0].spec));
>>>>>>> +    free(CONST_CAST(void *, patterns->items[0].mask)); #endif /*
>>>>>>> +ALLOW_EXPERIMENTAL_API */
>>>>>>> +
>>>>>>>     if (patterns->tnl_pmd_items) {
>>>>>>>         struct rte_flow_item *tnl_pmd_items = patterns->tnl_pmd_items;
>>>>>>>         uint32_t tnl_pmd_items_cnt = patterns->tnl_pmd_items_cnt;
>>>>>>> @@
>>>>>>> -1049,7 +1070,12 @@ free_flow_patterns(struct flow_patterns
>>>>>>> *patterns)
>>>>>>>         }
>>>>>>>     }
>>>>>>>
>>>>>>> +#ifdef ALLOW_EXPERIMENTAL_API
>>>>>>> +    for (i = 1 /* REPRESENTED_PORT */ + patterns-
>>tnl_pmd_items_cnt;
>>>>>>> +         i < patterns->cnt; i++) { #else /* !
>>>>>>> +ALLOW_EXPERIMENTAL_API */
>>>>>>>     for (i = patterns->tnl_pmd_items_cnt; i < patterns->cnt; i++)
>>>>>>> {
>>>>>>> +#endif /* ALLOW_EXPERIMENTAL_API */
>>>>>>>         if (patterns->items[i].spec) {
>>>>>>>             free(CONST_CAST(void *, patterns->items[i].spec));
>>>>>>>         }
>>>>>>> @@ -1383,10 +1409,23 @@ parse_flow_match(struct netdev
>*netdev,
>>>>>>>                  struct flow_patterns *patterns,
>>>>>>>                  struct match *match) {
>>>>>>> +#ifdef ALLOW_EXPERIMENTAL_API
>>>>>>> +    struct netdev *physdev = netdev_ports_get(orig_in_port,
>>>>>>> +netdev-
>>>>>>>> dpif_type);
>>>>>>> +    struct rte_flow_item_ethdev *ethdev_spec = xzalloc(sizeof
>>>>> *ethdev_spec);
>>>>>>> +    struct rte_flow_item_ethdev *ethdev_mask = xzalloc(sizeof
>>>>>>> *ethdev_mask);
>>>>>>> +#endif /* ALLOW_EXPERIMENTAL_API */
>>>>>>>     struct rte_flow_item_eth *eth_spec = NULL, *eth_mask = NULL;
>>>>>>>     struct flow *consumed_masks;
>>>>>>>     uint8_t proto = 0;
>>>>>>>
>>>>>>> +#ifdef ALLOW_EXPERIMENTAL_API
>>>>>>> +    /* Add an explicit traffic source item to the beginning of the
>pattern.
>>> */
>>>>>>> +    ethdev_spec->port_id = netdev_dpdk_get_port_id(physdev);
>>>>>>> +    *ethdev_mask = rte_flow_item_ethdev_mask;
>>>>>>> +    add_flow_pattern(patterns,
>>>>>>> RTE_FLOW_ITEM_TYPE_REPRESENTED_PORT,
>>>>>>> +                     ethdev_spec, ethdev_mask, NULL); #endif /*
>>>>>>> +ALLOW_EXPERIMENTAL_API */
>>>>>>> +
>>>>>>>     consumed_masks = &match->wc.masks;
>>>>>>>
>>>>>>>     if (!flow_tnl_dst_is_set(&match->flow.tunnel)) { @@ -2333,6
>>>>>>> +2372,7 @@ netdev_offload_dpdk_flow_destroy(struct
>>>>>>> ufid_to_rte_flow_data *rte_flow_data)
>>>>>>>     struct netdev *physdev;
>>>>>>>     struct netdev *netdev;
>>>>>>>     ovs_u128 *ufid;
>>>>>>> +    bool transfer;
>>>>>>>     int ret;
>>>>>>>
>>>>>>>     ovs_mutex_lock(&rte_flow_data->lock);
>>>>>>> @@ -2344,12 +2384,13 @@
>netdev_offload_dpdk_flow_destroy(struct
>>>>>>> ufid_to_rte_flow_data *rte_flow_data)
>>>>>>>
>>>>>>>     rte_flow_data->dead = true;
>>>>>>>
>>>>>>> +    transfer = rte_flow_data->actions_offloaded;
>>>>>>>     rte_flow = rte_flow_data->rte_flow;
>>>>>>>     physdev = rte_flow_data->physdev;
>>>>>>>     netdev = rte_flow_data->netdev;
>>>>>>>     ufid = &rte_flow_data->ufid;
>>>>>>>
>>>>>>> -    ret = netdev_dpdk_rte_flow_destroy(physdev, rte_flow, &error);
>>>>>>> +    ret = netdev_dpdk_rte_flow_destroy(physdev, transfer,
>>>>>>> + rte_flow, &error);
>>>>>>>
>>>>>>>     if (ret == 0) {
>>>>>>>         struct netdev_offload_dpdk_data *data;
>>>>>>> --
>>>>>>> 2.30.2
>>>>>>
>>>>>>
>>>>
>>
Ivan Malov July 20, 2022, 12:27 p.m. UTC | #9
Hi Eli,

Thank you for your review efforts. Please find the new version:

http://patchwork.ozlabs.org/project/openvswitch/patch/20220720121823.2497727-4-ivan.malov@oktetlabs.ru/

It addresses the detach scenario we have been discussing,
and it also fixes the port ID logging for transfer flows.

On Wed, 8 Jun 2022, Eli Britstein wrote:

> Hi Ivan,
>
>> -----Original Message-----
>> From: Ivan Malov <ivan.malov@oktetlabs.ru>
>> Sent: Wednesday, June 8, 2022 10:02 PM
>> To: Eli Britstein <elibr@nvidia.com>
>> Cc: dev@openvswitch.org; Andrew Rybchenko
>> <andrew.rybchenko@oktetlabs.ru>; Ilya Maximets <i.maximets@ovn.org>;
>> Ori Kam <orika@nvidia.com>; NBU-Contact-Thomas Monjalon (EXTERNAL)
>> <thomas@monjalon.net>; Stephen Hemminger
>> <stephen@networkplumber.org>; David Marchand
>> <david.marchand@redhat.com>; Gaetan Rivet <grive@u256.net>; Maxime
>> Coquelin <maxime.coquelin@redhat.com>
>> Subject: RE: [PATCH 3/3] netdev-offload-dpdk: use flow transfer proxy
>> mechanism
>>
>> External email: Use caution opening links or attachments
>>
>>
>> Hi Eli,
>>
>> On Wed, 8 Jun 2022, Eli Britstein wrote:
>>
>>> Hi Ivan,
>>>
>>>> -----Original Message-----
>>>> From: Ivan Malov <ivan.malov@oktetlabs.ru>
>>>> Sent: Wednesday, June 8, 2022 5:46 PM
>>>> To: Eli Britstein <elibr@nvidia.com>
>>>> Cc: dev@openvswitch.org; Andrew Rybchenko
>>>> <andrew.rybchenko@oktetlabs.ru>; Ilya Maximets <i.maximets@ovn.org>;
>>>> Ori Kam <orika@nvidia.com>; NBU-Contact-Thomas Monjalon (EXTERNAL)
>>>> <thomas@monjalon.net>; Stephen Hemminger
>>>> <stephen@networkplumber.org>; David Marchand
>>>> <david.marchand@redhat.com>; Gaetan Rivet <grive@u256.net>; Maxime
>>>> Coquelin <maxime.coquelin@redhat.com>
>>>> Subject: RE: [PATCH 3/3] netdev-offload-dpdk: use flow transfer proxy
>>>> mechanism
>>>>
>>>> External email: Use caution opening links or attachments
>>>>
>>>>
>>>> Hi Eli,
>>>>
>>>> On Wed, 8 Jun 2022, Eli Britstein wrote:
>>>>
>>>>> Hi Ivan,
>>>>>
>>>>>> -----Original Message-----
>>>>>> From: Ivan Malov <ivan.malov@oktetlabs.ru>
>>>>>> Sent: Tuesday, June 7, 2022 11:56 PM
>>>>>> To: Eli Britstein <elibr@nvidia.com>
>>>>>> Cc: dev@openvswitch.org; Andrew Rybchenko
>>>>>> <andrew.rybchenko@oktetlabs.ru>; Ilya Maximets
>>>>>> <i.maximets@ovn.org>; Ori Kam <orika@nvidia.com>;
>>>>>> NBU-Contact-Thomas Monjalon (EXTERNAL) <thomas@monjalon.net>;
>>>>>> Stephen Hemminger <stephen@networkplumber.org>; David Marchand
>>>>>> <david.marchand@redhat.com>; Gaetan Rivet <grive@u256.net>;
>> Maxime
>>>>>> Coquelin <maxime.coquelin@redhat.com>
>>>>>> Subject: RE: [PATCH 3/3] netdev-offload-dpdk: use flow transfer
>>>>>> proxy mechanism
>>>>>>
>>>>>> External email: Use caution opening links or attachments
>>>>>>
>>>>>>
>>>>>> Hi Eli,
>>>>>>
>>>>>> On Wed, 1 Jun 2022, Eli Britstein wrote:
>>>>>>
>>>>>>> - Missing proper handling of the testpmd syntax logging. It
>>>>>>> changes the used
>>>>>> port according to "transfer", but the log still uses
>>>> netdev_dpdk_get_port_id().
>>>>>>
>>>>>> Thanks for noticing. I will see to it in the next version.
>>>>>>
>>>>>>> - The usage of the "proxy" port for rte-flow implies that this
>>>>>>> proxy port is
>>>>>> attached to OVS, otherwise it is not "started" and creation of
>>>>>> flows will
>>>> fail.
>>>>>>
>>>>>> That's the way it is. If there is no proxy for a given port, then
>>>>>> the original port value will be used for managing flows. For
>>>>>> vendors that don't need the proxy, this will work. For others, it won't.
>> That's OK.
>>>>
>>>>> I don't really understand why this can't be done inside dpdk domain
>>>>> (if there
>>>> is a proxy, and it is up, use it, otherwise don't).
>>>>> That's *currently* the way it is. I understand that if dpdk works
>>>>> like this OVS
>>>> should align, but maybe you or someone else here knows why dpdk works
>>>> like this? (not too late to change, this is experimental...).
>>>>
>>>>
>>>> Regardless of DPDK, on some NICs, it is possible to insert rules via
>>>> unprivileged PFs or VFs, but there are also NICs which cannot do it.
>>>>
>>>> In DPDK, this contradiction has to be resolved somehow.
>>>> In example, for NICs that can only manage flows via privileged ports,
>>>> two possible solutions exist:
>>>>
>>>> 1. Route flow management requests from unprivileged ethdevs
>>>>    to the privileged one implicitly, inside the PMD. This
>>>>    is transparent to users, but, at the same time, it is
>>>>    tricky because the application does not realise that
>>>>    flows it manages via an ethdev "B" are in fact
>>>>    communicated to the NIC via an ethdev "A".
>>>>
>>>>    Unbeknownst of the implicit scheme, the application may
>>>>    detach the privileged ethdev "A" in-between. And, when
>>>>    time comes to remove flows, doing so via ethdev "B"
>>>>    will fail. This scheme breaks in-app housekeeping.
>>>>
>>>> 2. Expose the "proxy" port existence to the application.
>>>>    If it knows the truth about the real ethdev that
>>>>    handles the transfer flows, it won't attempt to
>>>>    detach it in-between. The housekeeping is fine.
>>>>
>>>> Outing the existence of the "proxy" port to users seems like the most
>>>> reasonable approach. This is why it was implemented in DPDK like this.
>>>> Currently, it's indeed an experimental feature. DPDK PMDs which need
>>>> it, are supposed to switch to it during the transition phase.
>>> Thanks very much for the explanation, though IMHO relevant PMDs could
>> still hide it and not do this "outing" of their internals.
>>
>>
>> Sort of yes, they could hide it. But that would mean doing additional record-
>> keeping internally, in order to return EBUSY when the app asks to detach the
>> privileged port which still has active flows on it that have been originally
>> requested via an unprivileged port. Might be quite error prone. Also, given the
>> fact that quite a few vendors might need this, isn't it better to make the
>> feature generic?
> Discussing such scenario, this patch does not handle it. Suppose A is the privileged port, served as a proxy for port B.
> Now, there are flows applied on port B (but actually on A). Nothing prevents OVS to detach port A. The flows applied on port B remain ghosted in OVS. When they are being removed, the behavior is unknown, and can also crash.
> Am I wrong?
>>
>>
>>>>
>>>> However, I should stress out that to NICs that support managing
>>>> transfer flows on any PFs and VFs, this proxy scheme is a don't care.
>>>> The corresponding drivers may not implement the proxy query method at
>> all:
>>>>
>>>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fgit
>>>> hu
>>>>
>> b.com%2FDPDK%2Fdpdk%2Fblob%2Fmain%2Flib%2Fethdev%2Frte_flow.c%2
>>>>
>> 3L1345&amp;data=05%7C01%7Celibr%40nvidia.com%7Cf5a80eb00f0342498
>>>>
>> 63308da495dab8b%7C43083d15727340c1b7db39efd9ccc17a%7C0%7C0%7C6
>>>>
>> 37902963929533013%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMD
>>>>
>> AiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000%7C%7C%7C
>>>>
>> &amp;sdata=ojwUOsPlz09NXtDXfeO8lAT%2BHcgGYWNRdIhxB6f0cy0%3D&a
>>>> mp;reserved=0
>>>>
>>>> The generic part of the API will just return the original port ID to
>>>> the application.
>>> Yes, I saw that. Thanks.
>>
>>
>> You're very welcome.
>>
>>
>>>>
>>>>
>>>>>>
>>>>>>>
>>>>>>>> -----Original Message-----
>>>>>>>> From: Ivan Malov <ivan.malov@oktetlabs.ru>
>>>>>>>> Sent: Monday, May 30, 2022 5:16 PM
>>>>>>>> To: dev@openvswitch.org
>>>>>>>> Cc: Andrew Rybchenko <andrew.rybchenko@oktetlabs.ru>; Ilya
>>>> Maximets
>>>>>>>> <i.maximets@ovn.org>; Ori Kam <orika@nvidia.com>; Eli Britstein
>>>>>>>> <elibr@nvidia.com>; NBU-Contact-Thomas Monjalon (EXTERNAL)
>>>>>>>> <thomas@monjalon.net>; Stephen Hemminger
>>>>>>>> <stephen@networkplumber.org>; David Marchand
>>>>>>>> <david.marchand@redhat.com>; Gaetan Rivet <grive@u256.net>;
>>>> Maxime
>>>>>>>> Coquelin <maxime.coquelin@redhat.com>
>>>>>>>> Subject: [PATCH 3/3] netdev-offload-dpdk: use flow transfer proxy
>>>>>>>> mechanism
>>>>>>>>
>>>>>>>> External email: Use caution opening links or attachments
>>>>>>>>
>>>>>>>>
>>>>>>>> Manage "transfer" flows via the corresponding mechanism.
>>>>>>>> Doing so requires that the traffic source be specified
>>>>>>>> explicitly, via the corresponding pattern item.
>>>>>>>>
>>>>>>>> Signed-off-by: Ivan Malov <ivan.malov@oktetlabs.ru>
>>>>>>>> Acked-by: Andrew Rybchenko <andrew.rybchenko@oktetlabs.ru>
>>>>>>>> ---
>>>>>>>> lib/netdev-dpdk.c         | 73 ++++++++++++++++++++++++++++++++----
>> ---
>>>>>>>> lib/netdev-dpdk.h         |  2 +-
>>>>>>>> lib/netdev-offload-dpdk.c | 43 ++++++++++++++++++++++-
>>>>>>>> 3 files changed, 103 insertions(+), 15 deletions(-)
>>>>>>>>
>>>>>>>> diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index
>>>>>>>> 45e5d26d2..d0bf4613a 100644
>>>>>>>> --- a/lib/netdev-dpdk.c
>>>>>>>> +++ b/lib/netdev-dpdk.c
>>>>>>>> @@ -420,6 +420,7 @@ enum dpdk_hw_ol_features {
>>>>>>>>
>>>>>>>> struct netdev_dpdk {
>>>>>>>>     PADDED_MEMBERS_CACHELINE_MARKER(CACHE_LINE_SIZE,
>>>>>> cacheline0,
>>>>>>>> +        dpdk_port_t flow_transfer_proxy_port_id;
>>>>>>>>         dpdk_port_t port_id;
>>>>>>>>
>>>>>>>>         /* If true, device was attached by rte_eth_dev_attach().
>>>>>>>> */ @@ -1115,6
>>>>>>>> +1116,23 @@ dpdk_eth_dev_init_rx_metadata(struct netdev_dpdk
>>>> *dev)
>>>>>>>>                   DPDK_PORT_ID_FMT, dev->port_id);
>>>>>>>>     }
>>>>>>>> }
>>>>>>>> +
>>>>>>>> +static void
>>>>>>>> +dpdk_eth_dev_init_flow_transfer_proxy(struct netdev_dpdk *dev) {
>>>>>>>> +    int ret;
>>>>>>>> +
>>>>>>>> +    ret = rte_flow_pick_transfer_proxy(dev->port_id,
>>>>>>>> +                                       &dev->flow_transfer_proxy_port_id, NULL);
>>>>>>>> +    if (ret == 0)
>>>>>>>> +        return;
>>>>>>>> +
>>>>>>>> +    /*
>>>>>>>> +     * The PMD does not indicate the proxy port.
>>>>>>>> +     * It is OK to assume the proxy is unneeded.
>>>>>>>> +     */
>>>>>>>> +    dev->flow_transfer_proxy_port_id = dev->port_id; }
>>>>>>>> #endif /* ALLOW_EXPERIMENTAL_API */
>>>>>>>>
>>>>>>>> static int
>>>>>>>> @@ -1141,6 +1159,19 @@ dpdk_eth_dev_init(struct netdev_dpdk
>> *dev)
>>>>>>>>      * Request delivery of such metadata.
>>>>>>>>      */
>>>>>>>>     dpdk_eth_dev_init_rx_metadata(dev);
>>>>>>>> +
>>>>>>>> +    /*
>>>>>>>> +     * Managing "transfer" flows requires that the user
>>>>>>>> + communicate
>>>> them
>>>>>>>> +     * via a port which has the privilege to control the embedded
>> switch.
>>>>>>>> +     * For some vendors, all ports in a given switching domain have
>>>>>>>> +     * this privilege. For other vendors, it's only one port.
>>>>>>>> +     *
>>>>>>>> +     * Get the proxy port ID and remember it for later use.
>>>>>>>> +     */
>>>>>>>> +    dpdk_eth_dev_init_flow_transfer_proxy(dev);
>>>>>>>> +#else /* ! ALLOW_EXPERIMENTAL_API */
>>>>>>>> +    /* It is OK to assume the proxy is unneeded. */
>>>>>>>> +    dev->flow_transfer_proxy_port_id = dev->port_id;
>>>>>>>> #endif /* ALLOW_EXPERIMENTAL_API */
>>>>>>>>
>>>>>>>>     rte_eth_dev_info_get(dev->port_id, &info); @@ -5214,13
>>>>>>>> +5245,15 @@ out:
>>>>>>>>
>>>>>>>> int
>>>>>>>> netdev_dpdk_rte_flow_destroy(struct netdev *netdev,
>>>>>>>> -                             struct rte_flow *rte_flow,
>>>>>>>> +                             bool transfer, struct rte_flow
>>>>>>>> + *rte_flow,
>>>>>>>>                              struct rte_flow_error *error) {
>>>>>>>>     struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
>>>>>>>>     int ret;
>>>>>>>>
>>>>>>>> -    ret = rte_flow_destroy(dev->port_id, rte_flow, error);
>>>>>>>> +    ret = rte_flow_destroy(transfer ?
>>>>>>>> +                           dev->flow_transfer_proxy_port_id : dev->port_id,
>>>>>>>> +                           rte_flow, error);
>>>>>>>>     return ret;
>>>>>>>> }
>>>>>>>>
>>>>>>>> @@ -5234,7 +5267,19 @@ netdev_dpdk_rte_flow_create(struct
>> netdev
>>>>>>>> *netdev,
>>>>>>>>     struct rte_flow *flow;
>>>>>>>>     struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
>>>>>>>>
>>>>>>>> -    flow = rte_flow_create(dev->port_id, attr, items, actions, error);
>>>>>>>> +#ifdef ALLOW_EXPERIMENTAL_API
>>>>>>>> +    if (!attr->transfer) {
>>>>>>>> +        /*
>>>>>>>> +         * The 1st item in any pattern is a traffic source one.
>>>>>>>> +         * It is unnecessary in the case of non-transfer rules.
>>>>>>>> +         */
>>>>>>>> +        ++(items);
>>>>>>>> +    }
>>>>>>>> +#endif /* ALLOW_EXPERIMENTAL_API */
>>>>>>>> +
>>>>>>>> +    flow = rte_flow_create(attr->transfer ?
>>>>>>>> +                           dev->flow_transfer_proxy_port_id : dev->port_id,
>>>>>>>> +                           attr, items, actions, error);
>>>>>>>>     return flow;
>>>>>>>> }
>>>>>>>>
>>>>>>>> @@ -5262,7 +5307,8 @@ netdev_dpdk_rte_flow_query_count(struct
>>>>>> netdev
>>>>>>>> *netdev,
>>>>>>>>     }
>>>>>>>>
>>>>>>>>     dev = netdev_dpdk_cast(netdev);
>>>>>>>> -    ret = rte_flow_query(dev->port_id, rte_flow, actions, query, error);
>>>>>>>> +    ret = rte_flow_query(dev->flow_transfer_proxy_port_id, rte_flow,
>>>>>>>> +                         actions, query, error);
>>>>>>>>     return ret;
>>>>>>>> }
>>>>>>>>
>>>>>>>> @@ -5284,8 +5330,8 @@
>>>> netdev_dpdk_rte_flow_tunnel_decap_set(struct
>>>>>>>> netdev *netdev,
>>>>>>>>
>>>>>>>>     dev = netdev_dpdk_cast(netdev);
>>>>>>>>     ovs_mutex_lock(&dev->mutex);
>>>>>>>> -    ret = rte_flow_tunnel_decap_set(dev->port_id, tunnel, actions,
>>>>>>>> -                                    num_of_actions, error);
>>>>>>>> +    ret =
>>>>>>>> + rte_flow_tunnel_decap_set(dev->flow_transfer_proxy_port_id,
>>>>>>>> tunnel,
>>>>>>>> +                                    actions, num_of_actions,
>>>>>>>> + error);
>>>>>>>>     ovs_mutex_unlock(&dev->mutex);
>>>>>>>>     return ret;
>>>>>>>> }
>>>>>>>> @@ -5306,8 +5352,8 @@
>> netdev_dpdk_rte_flow_tunnel_match(struct
>>>>>>>> netdev *netdev,
>>>>>>>>
>>>>>>>>     dev = netdev_dpdk_cast(netdev);
>>>>>>>>     ovs_mutex_lock(&dev->mutex);
>>>>>>>> -    ret = rte_flow_tunnel_match(dev->port_id, tunnel, items,
>>>>>> num_of_items,
>>>>>>>> -                                error);
>>>>>>>> +    ret =
>>>>>>>> + rte_flow_tunnel_match(dev->flow_transfer_proxy_port_id,
>>>>>> tunnel,
>>>>>>>> +                                items, num_of_items, error);
>>>>>>>>     ovs_mutex_unlock(&dev->mutex);
>>>>>>>>     return ret;
>>>>>>>> }
>>>>>>>> @@ -5328,7 +5374,8 @@
>>>> netdev_dpdk_rte_flow_get_restore_info(struct
>>>>>>>> netdev *netdev,
>>>>>>>>
>>>>>>>>     dev = netdev_dpdk_cast(netdev);
>>>>>>>>     ovs_mutex_lock(&dev->mutex);
>>>>>>>> -    ret = rte_flow_get_restore_info(dev->port_id, m, info, error);
>>>>>>>> +    ret = rte_flow_get_restore_info(dev-
>>> flow_transfer_proxy_port_id,
>>>>>>>> +                                    m, info, error);
>>>>>>>>     ovs_mutex_unlock(&dev->mutex);
>>>>>>>>     return ret;
>>>>>>>> }
>>>>>>>> @@ -5349,8 +5396,8 @@
>>>>>>>> netdev_dpdk_rte_flow_tunnel_action_decap_release(
>>>>>>>>
>>>>>>>>     dev = netdev_dpdk_cast(netdev);
>>>>>>>>     ovs_mutex_lock(&dev->mutex);
>>>>>>>> -    ret = rte_flow_tunnel_action_decap_release(dev->port_id,
>> actions,
>>>>>>>> -                                               num_of_actions, error);
>>>>>>>> +    ret = rte_flow_tunnel_action_decap_release(dev-
>>>>>>>>> flow_transfer_proxy_port_id,
>>>>>>>> +                                               actions,
>>>>>>>> + num_of_actions, error);
>>>>>>>>     ovs_mutex_unlock(&dev->mutex);
>>>>>>>>     return ret;
>>>>>>>> }
>>>>>>>> @@ -5370,8 +5417,8 @@
>>>>>> netdev_dpdk_rte_flow_tunnel_item_release(struct
>>>>>>>> netdev *netdev,
>>>>>>>>
>>>>>>>>     dev = netdev_dpdk_cast(netdev);
>>>>>>>>     ovs_mutex_lock(&dev->mutex);
>>>>>>>> -    ret = rte_flow_tunnel_item_release(dev->port_id, items,
>>>>>> num_of_items,
>>>>>>>> -                                       error);
>>>>>>>> +    ret = rte_flow_tunnel_item_release(dev-
>>>>> flow_transfer_proxy_port_id,
>>>>>>>> +                                       items, num_of_items,
>>>>>>>> + error);
>>>>>>>>     ovs_mutex_unlock(&dev->mutex);
>>>>>>>>     return ret;
>>>>>>>> }
>>>>>>>> diff --git a/lib/netdev-dpdk.h b/lib/netdev-dpdk.h index
>>>>>>>> 699be3fb4..1dd5953a4 100644
>>>>>>>> --- a/lib/netdev-dpdk.h
>>>>>>>> +++ b/lib/netdev-dpdk.h
>>>>>>>> @@ -35,7 +35,7 @@ bool netdev_dpdk_flow_api_supported(struct
>>>> netdev
>>>>>>>> *);
>>>>>>>>
>>>>>>>> int
>>>>>>>> netdev_dpdk_rte_flow_destroy(struct netdev *netdev,
>>>>>>>> -                             struct rte_flow *rte_flow,
>>>>>>>> +                             bool transfer, struct rte_flow
>>>>>>>> + *rte_flow,
>>>>>>>>                              struct rte_flow_error *error);
>>>>>>>> struct rte_flow * netdev_dpdk_rte_flow_create(struct netdev
>>>>>>>> *netdev, diff --git a/lib/netdev-offload-dpdk.c
>>>>>>>> b/lib/netdev-offload-dpdk.c index
>>>>>>>> 9cd5a0159..f8b90cbf7 100644
>>>>>>>> --- a/lib/netdev-offload-dpdk.c
>>>>>>>> +++ b/lib/netdev-offload-dpdk.c
>>>>>>>> @@ -353,8 +353,23 @@ dump_flow_pattern(struct ds *s,
>>>>>>>>
>>>>>>>>     if (item->type == RTE_FLOW_ITEM_TYPE_END) {
>>>>>>>>         ds_put_cstr(s, "end ");
>>>>>>>> +#ifdef ALLOW_EXPERIMENTAL_API
>>>>>>>> +    } else if (item->type ==
>>>> RTE_FLOW_ITEM_TYPE_REPRESENTED_PORT) {
>>>>>>>> +        const struct rte_flow_item_ethdev *ethdev_spec = item->spec;
>>>>>>>> +        const struct rte_flow_item_ethdev *ethdev_mask =
>>>>>>>> +item->mask;
>>>>>>>> +
>>>>>>>> +        ds_put_cstr(s, "represented_port ");
>>>>>>>> +
>>>>>>>> +        DUMP_PATTERN_ITEM(ethdev_mask->port_id, false,
>>>>>> "ethdev_port_id",
>>>>>>>> +                          "%"PRIu16, ethdev_spec->port_id,
>>>>>>>> +                          ethdev_mask->port_id, 0);
>>>>>>>> +    } else if (flow_patterns->tnl_pmd_items_cnt &&
>>>>>>>> +               pattern_index < 1 /* REPRESENTED_PORT */ +
>>>>>>>> +                               flow_patterns->tnl_pmd_items_cnt)
>>>>>>>> +{ #else /* ! ALLOW_EXPERIMENTAL_API */
>>>>>>>>     } else if (flow_patterns->tnl_pmd_items_cnt &&
>>>>>>>>                pattern_index < flow_patterns->tnl_pmd_items_cnt)
>>>>>>>> {
>>>>>>>> +#endif /* ALLOW_EXPERIMENTAL_API */
>>>>>>>>         return;
>>>>>>>>     } else if (item->type == RTE_FLOW_ITEM_TYPE_ETH) {
>>>>>>>>         const struct rte_flow_item_eth *eth_spec = item->spec; @@
>>>>>>>> -1035,6 +1050,12 @@ free_flow_patterns(struct flow_patterns
>>>>>>>> *patterns)
>>>>>>>>     struct rte_flow_error error;
>>>>>>>>     int i;
>>>>>>>>
>>>>>>>> +#ifdef ALLOW_EXPERIMENTAL_API
>>>>>>>> +    /* REPRESENTED_PORT */
>>>>>>>> +    free(CONST_CAST(void *, patterns->items[0].spec));
>>>>>>>> +    free(CONST_CAST(void *, patterns->items[0].mask)); #endif /*
>>>>>>>> +ALLOW_EXPERIMENTAL_API */
>>>>>>>> +
>>>>>>>>     if (patterns->tnl_pmd_items) {
>>>>>>>>         struct rte_flow_item *tnl_pmd_items = patterns->tnl_pmd_items;
>>>>>>>>         uint32_t tnl_pmd_items_cnt = patterns->tnl_pmd_items_cnt;
>>>>>>>> @@
>>>>>>>> -1049,7 +1070,12 @@ free_flow_patterns(struct flow_patterns
>>>>>>>> *patterns)
>>>>>>>>         }
>>>>>>>>     }
>>>>>>>>
>>>>>>>> +#ifdef ALLOW_EXPERIMENTAL_API
>>>>>>>> +    for (i = 1 /* REPRESENTED_PORT */ + patterns-
>>> tnl_pmd_items_cnt;
>>>>>>>> +         i < patterns->cnt; i++) { #else /* !
>>>>>>>> +ALLOW_EXPERIMENTAL_API */
>>>>>>>>     for (i = patterns->tnl_pmd_items_cnt; i < patterns->cnt; i++)
>>>>>>>> {
>>>>>>>> +#endif /* ALLOW_EXPERIMENTAL_API */
>>>>>>>>         if (patterns->items[i].spec) {
>>>>>>>>             free(CONST_CAST(void *, patterns->items[i].spec));
>>>>>>>>         }
>>>>>>>> @@ -1383,10 +1409,23 @@ parse_flow_match(struct netdev
>> *netdev,
>>>>>>>>                  struct flow_patterns *patterns,
>>>>>>>>                  struct match *match) {
>>>>>>>> +#ifdef ALLOW_EXPERIMENTAL_API
>>>>>>>> +    struct netdev *physdev = netdev_ports_get(orig_in_port,
>>>>>>>> +netdev-
>>>>>>>>> dpif_type);
>>>>>>>> +    struct rte_flow_item_ethdev *ethdev_spec = xzalloc(sizeof
>>>>>> *ethdev_spec);
>>>>>>>> +    struct rte_flow_item_ethdev *ethdev_mask = xzalloc(sizeof
>>>>>>>> *ethdev_mask);
>>>>>>>> +#endif /* ALLOW_EXPERIMENTAL_API */
>>>>>>>>     struct rte_flow_item_eth *eth_spec = NULL, *eth_mask = NULL;
>>>>>>>>     struct flow *consumed_masks;
>>>>>>>>     uint8_t proto = 0;
>>>>>>>>
>>>>>>>> +#ifdef ALLOW_EXPERIMENTAL_API
>>>>>>>> +    /* Add an explicit traffic source item to the beginning of the
>> pattern.
>>>> */
>>>>>>>> +    ethdev_spec->port_id = netdev_dpdk_get_port_id(physdev);
>>>>>>>> +    *ethdev_mask = rte_flow_item_ethdev_mask;
>>>>>>>> +    add_flow_pattern(patterns,
>>>>>>>> RTE_FLOW_ITEM_TYPE_REPRESENTED_PORT,
>>>>>>>> +                     ethdev_spec, ethdev_mask, NULL); #endif /*
>>>>>>>> +ALLOW_EXPERIMENTAL_API */
>>>>>>>> +
>>>>>>>>     consumed_masks = &match->wc.masks;
>>>>>>>>
>>>>>>>>     if (!flow_tnl_dst_is_set(&match->flow.tunnel)) { @@ -2333,6
>>>>>>>> +2372,7 @@ netdev_offload_dpdk_flow_destroy(struct
>>>>>>>> ufid_to_rte_flow_data *rte_flow_data)
>>>>>>>>     struct netdev *physdev;
>>>>>>>>     struct netdev *netdev;
>>>>>>>>     ovs_u128 *ufid;
>>>>>>>> +    bool transfer;
>>>>>>>>     int ret;
>>>>>>>>
>>>>>>>>     ovs_mutex_lock(&rte_flow_data->lock);
>>>>>>>> @@ -2344,12 +2384,13 @@
>> netdev_offload_dpdk_flow_destroy(struct
>>>>>>>> ufid_to_rte_flow_data *rte_flow_data)
>>>>>>>>
>>>>>>>>     rte_flow_data->dead = true;
>>>>>>>>
>>>>>>>> +    transfer = rte_flow_data->actions_offloaded;
>>>>>>>>     rte_flow = rte_flow_data->rte_flow;
>>>>>>>>     physdev = rte_flow_data->physdev;
>>>>>>>>     netdev = rte_flow_data->netdev;
>>>>>>>>     ufid = &rte_flow_data->ufid;
>>>>>>>>
>>>>>>>> -    ret = netdev_dpdk_rte_flow_destroy(physdev, rte_flow, &error);
>>>>>>>> +    ret = netdev_dpdk_rte_flow_destroy(physdev, transfer,
>>>>>>>> + rte_flow, &error);
>>>>>>>>
>>>>>>>>     if (ret == 0) {
>>>>>>>>         struct netdev_offload_dpdk_data *data;
>>>>>>>> --
>>>>>>>> 2.30.2
>>>>>>>
>>>>>>>
>>>>>
>>>
>

--
Best regards,
Ivan M
diff mbox series

Patch

diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index 45e5d26d2..d0bf4613a 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -420,6 +420,7 @@  enum dpdk_hw_ol_features {
 
 struct netdev_dpdk {
     PADDED_MEMBERS_CACHELINE_MARKER(CACHE_LINE_SIZE, cacheline0,
+        dpdk_port_t flow_transfer_proxy_port_id;
         dpdk_port_t port_id;
 
         /* If true, device was attached by rte_eth_dev_attach(). */
@@ -1115,6 +1116,23 @@  dpdk_eth_dev_init_rx_metadata(struct netdev_dpdk *dev)
                   DPDK_PORT_ID_FMT, dev->port_id);
     }
 }
+
+static void
+dpdk_eth_dev_init_flow_transfer_proxy(struct netdev_dpdk *dev)
+{
+    int ret;
+
+    ret = rte_flow_pick_transfer_proxy(dev->port_id,
+                                       &dev->flow_transfer_proxy_port_id, NULL);
+    if (ret == 0)
+        return;
+
+    /*
+     * The PMD does not indicate the proxy port.
+     * It is OK to assume the proxy is unneeded.
+     */
+    dev->flow_transfer_proxy_port_id = dev->port_id;
+}
 #endif /* ALLOW_EXPERIMENTAL_API */
 
 static int
@@ -1141,6 +1159,19 @@  dpdk_eth_dev_init(struct netdev_dpdk *dev)
      * Request delivery of such metadata.
      */
     dpdk_eth_dev_init_rx_metadata(dev);
+
+    /*
+     * Managing "transfer" flows requires that the user communicate them
+     * via a port which has the privilege to control the embedded switch.
+     * For some vendors, all ports in a given switching domain have
+     * this privilege. For other vendors, it's only one port.
+     *
+     * Get the proxy port ID and remember it for later use.
+     */
+    dpdk_eth_dev_init_flow_transfer_proxy(dev);
+#else /* ! ALLOW_EXPERIMENTAL_API */
+    /* It is OK to assume the proxy is unneeded. */
+    dev->flow_transfer_proxy_port_id = dev->port_id;
 #endif /* ALLOW_EXPERIMENTAL_API */
 
     rte_eth_dev_info_get(dev->port_id, &info);
@@ -5214,13 +5245,15 @@  out:
 
 int
 netdev_dpdk_rte_flow_destroy(struct netdev *netdev,
-                             struct rte_flow *rte_flow,
+                             bool transfer, struct rte_flow *rte_flow,
                              struct rte_flow_error *error)
 {
     struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
     int ret;
 
-    ret = rte_flow_destroy(dev->port_id, rte_flow, error);
+    ret = rte_flow_destroy(transfer ?
+                           dev->flow_transfer_proxy_port_id : dev->port_id,
+                           rte_flow, error);
     return ret;
 }
 
@@ -5234,7 +5267,19 @@  netdev_dpdk_rte_flow_create(struct netdev *netdev,
     struct rte_flow *flow;
     struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
 
-    flow = rte_flow_create(dev->port_id, attr, items, actions, error);
+#ifdef ALLOW_EXPERIMENTAL_API
+    if (!attr->transfer) {
+        /*
+         * The 1st item in any pattern is a traffic source one.
+         * It is unnecessary in the case of non-transfer rules.
+         */
+        ++(items);
+    }
+#endif /* ALLOW_EXPERIMENTAL_API */
+
+    flow = rte_flow_create(attr->transfer ?
+                           dev->flow_transfer_proxy_port_id : dev->port_id,
+                           attr, items, actions, error);
     return flow;
 }
 
@@ -5262,7 +5307,8 @@  netdev_dpdk_rte_flow_query_count(struct netdev *netdev,
     }
 
     dev = netdev_dpdk_cast(netdev);
-    ret = rte_flow_query(dev->port_id, rte_flow, actions, query, error);
+    ret = rte_flow_query(dev->flow_transfer_proxy_port_id, rte_flow,
+                         actions, query, error);
     return ret;
 }
 
@@ -5284,8 +5330,8 @@  netdev_dpdk_rte_flow_tunnel_decap_set(struct netdev *netdev,
 
     dev = netdev_dpdk_cast(netdev);
     ovs_mutex_lock(&dev->mutex);
-    ret = rte_flow_tunnel_decap_set(dev->port_id, tunnel, actions,
-                                    num_of_actions, error);
+    ret = rte_flow_tunnel_decap_set(dev->flow_transfer_proxy_port_id, tunnel,
+                                    actions, num_of_actions, error);
     ovs_mutex_unlock(&dev->mutex);
     return ret;
 }
@@ -5306,8 +5352,8 @@  netdev_dpdk_rte_flow_tunnel_match(struct netdev *netdev,
 
     dev = netdev_dpdk_cast(netdev);
     ovs_mutex_lock(&dev->mutex);
-    ret = rte_flow_tunnel_match(dev->port_id, tunnel, items, num_of_items,
-                                error);
+    ret = rte_flow_tunnel_match(dev->flow_transfer_proxy_port_id, tunnel,
+                                items, num_of_items, error);
     ovs_mutex_unlock(&dev->mutex);
     return ret;
 }
@@ -5328,7 +5374,8 @@  netdev_dpdk_rte_flow_get_restore_info(struct netdev *netdev,
 
     dev = netdev_dpdk_cast(netdev);
     ovs_mutex_lock(&dev->mutex);
-    ret = rte_flow_get_restore_info(dev->port_id, m, info, error);
+    ret = rte_flow_get_restore_info(dev->flow_transfer_proxy_port_id,
+                                    m, info, error);
     ovs_mutex_unlock(&dev->mutex);
     return ret;
 }
@@ -5349,8 +5396,8 @@  netdev_dpdk_rte_flow_tunnel_action_decap_release(
 
     dev = netdev_dpdk_cast(netdev);
     ovs_mutex_lock(&dev->mutex);
-    ret = rte_flow_tunnel_action_decap_release(dev->port_id, actions,
-                                               num_of_actions, error);
+    ret = rte_flow_tunnel_action_decap_release(dev->flow_transfer_proxy_port_id,
+                                               actions, num_of_actions, error);
     ovs_mutex_unlock(&dev->mutex);
     return ret;
 }
@@ -5370,8 +5417,8 @@  netdev_dpdk_rte_flow_tunnel_item_release(struct netdev *netdev,
 
     dev = netdev_dpdk_cast(netdev);
     ovs_mutex_lock(&dev->mutex);
-    ret = rte_flow_tunnel_item_release(dev->port_id, items, num_of_items,
-                                       error);
+    ret = rte_flow_tunnel_item_release(dev->flow_transfer_proxy_port_id,
+                                       items, num_of_items, error);
     ovs_mutex_unlock(&dev->mutex);
     return ret;
 }
diff --git a/lib/netdev-dpdk.h b/lib/netdev-dpdk.h
index 699be3fb4..1dd5953a4 100644
--- a/lib/netdev-dpdk.h
+++ b/lib/netdev-dpdk.h
@@ -35,7 +35,7 @@  bool netdev_dpdk_flow_api_supported(struct netdev *);
 
 int
 netdev_dpdk_rte_flow_destroy(struct netdev *netdev,
-                             struct rte_flow *rte_flow,
+                             bool transfer, struct rte_flow *rte_flow,
                              struct rte_flow_error *error);
 struct rte_flow *
 netdev_dpdk_rte_flow_create(struct netdev *netdev,
diff --git a/lib/netdev-offload-dpdk.c b/lib/netdev-offload-dpdk.c
index 9cd5a0159..f8b90cbf7 100644
--- a/lib/netdev-offload-dpdk.c
+++ b/lib/netdev-offload-dpdk.c
@@ -353,8 +353,23 @@  dump_flow_pattern(struct ds *s,
 
     if (item->type == RTE_FLOW_ITEM_TYPE_END) {
         ds_put_cstr(s, "end ");
+#ifdef ALLOW_EXPERIMENTAL_API
+    } else if (item->type == RTE_FLOW_ITEM_TYPE_REPRESENTED_PORT) {
+        const struct rte_flow_item_ethdev *ethdev_spec = item->spec;
+        const struct rte_flow_item_ethdev *ethdev_mask = item->mask;
+
+        ds_put_cstr(s, "represented_port ");
+
+        DUMP_PATTERN_ITEM(ethdev_mask->port_id, false, "ethdev_port_id",
+                          "%"PRIu16, ethdev_spec->port_id,
+                          ethdev_mask->port_id, 0);
+    } else if (flow_patterns->tnl_pmd_items_cnt &&
+               pattern_index < 1 /* REPRESENTED_PORT */ +
+                               flow_patterns->tnl_pmd_items_cnt) {
+#else /* ! ALLOW_EXPERIMENTAL_API */
     } else if (flow_patterns->tnl_pmd_items_cnt &&
                pattern_index < flow_patterns->tnl_pmd_items_cnt) {
+#endif /* ALLOW_EXPERIMENTAL_API */
         return;
     } else if (item->type == RTE_FLOW_ITEM_TYPE_ETH) {
         const struct rte_flow_item_eth *eth_spec = item->spec;
@@ -1035,6 +1050,12 @@  free_flow_patterns(struct flow_patterns *patterns)
     struct rte_flow_error error;
     int i;
 
+#ifdef ALLOW_EXPERIMENTAL_API
+    /* REPRESENTED_PORT */
+    free(CONST_CAST(void *, patterns->items[0].spec));
+    free(CONST_CAST(void *, patterns->items[0].mask));
+#endif /* ALLOW_EXPERIMENTAL_API */
+
     if (patterns->tnl_pmd_items) {
         struct rte_flow_item *tnl_pmd_items = patterns->tnl_pmd_items;
         uint32_t tnl_pmd_items_cnt = patterns->tnl_pmd_items_cnt;
@@ -1049,7 +1070,12 @@  free_flow_patterns(struct flow_patterns *patterns)
         }
     }
 
+#ifdef ALLOW_EXPERIMENTAL_API
+    for (i = 1 /* REPRESENTED_PORT */ + patterns->tnl_pmd_items_cnt;
+         i < patterns->cnt; i++) {
+#else /* ! ALLOW_EXPERIMENTAL_API */
     for (i = patterns->tnl_pmd_items_cnt; i < patterns->cnt; i++) {
+#endif /* ALLOW_EXPERIMENTAL_API */
         if (patterns->items[i].spec) {
             free(CONST_CAST(void *, patterns->items[i].spec));
         }
@@ -1383,10 +1409,23 @@  parse_flow_match(struct netdev *netdev,
                  struct flow_patterns *patterns,
                  struct match *match)
 {
+#ifdef ALLOW_EXPERIMENTAL_API
+    struct netdev *physdev = netdev_ports_get(orig_in_port, netdev->dpif_type);
+    struct rte_flow_item_ethdev *ethdev_spec = xzalloc(sizeof *ethdev_spec);
+    struct rte_flow_item_ethdev *ethdev_mask = xzalloc(sizeof *ethdev_mask);
+#endif /* ALLOW_EXPERIMENTAL_API */
     struct rte_flow_item_eth *eth_spec = NULL, *eth_mask = NULL;
     struct flow *consumed_masks;
     uint8_t proto = 0;
 
+#ifdef ALLOW_EXPERIMENTAL_API
+    /* Add an explicit traffic source item to the beginning of the pattern. */
+    ethdev_spec->port_id = netdev_dpdk_get_port_id(physdev);
+    *ethdev_mask = rte_flow_item_ethdev_mask;
+    add_flow_pattern(patterns, RTE_FLOW_ITEM_TYPE_REPRESENTED_PORT,
+                     ethdev_spec, ethdev_mask, NULL);
+#endif /* ALLOW_EXPERIMENTAL_API */
+
     consumed_masks = &match->wc.masks;
 
     if (!flow_tnl_dst_is_set(&match->flow.tunnel)) {
@@ -2333,6 +2372,7 @@  netdev_offload_dpdk_flow_destroy(struct ufid_to_rte_flow_data *rte_flow_data)
     struct netdev *physdev;
     struct netdev *netdev;
     ovs_u128 *ufid;
+    bool transfer;
     int ret;
 
     ovs_mutex_lock(&rte_flow_data->lock);
@@ -2344,12 +2384,13 @@  netdev_offload_dpdk_flow_destroy(struct ufid_to_rte_flow_data *rte_flow_data)
 
     rte_flow_data->dead = true;
 
+    transfer = rte_flow_data->actions_offloaded;
     rte_flow = rte_flow_data->rte_flow;
     physdev = rte_flow_data->physdev;
     netdev = rte_flow_data->netdev;
     ufid = &rte_flow_data->ufid;
 
-    ret = netdev_dpdk_rte_flow_destroy(physdev, rte_flow, &error);
+    ret = netdev_dpdk_rte_flow_destroy(physdev, transfer, rte_flow, &error);
 
     if (ret == 0) {
         struct netdev_offload_dpdk_data *data;