[ovs-dev,RFC,v7,11/13] netdev-dpdk: copy large packet to multi-seg. mbufs

Message ID 1527094048-105084-12-git-send-email-tiago.lam@intel.com
State Superseded
Headers show
Series
  • Support multi-segment mbufs
Related show

Commit Message

Tiago Lam May 23, 2018, 4:47 p.m.
From: Mark Kavanagh <mark.b.kavanagh@intel.com>

Currently, packets are only copied to a single segment in
the function dpdk_do_tx_copy(). This could be an issue in
the case of jumbo frames, particularly when multi-segment
mbufs are involved.

This patch calculates the number of segments needed by a
packet and copies the data to each segment.

Co-authored-by: Michael Qiu <qiudayu@chinac.com>

Signed-off-by: Mark Kavanagh <mark.b.kavanagh@intel.com>
Signed-off-by: Michael Qiu <qiudayu@chinac.com>
---
 lib/netdev-dpdk.c | 78 ++++++++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 68 insertions(+), 10 deletions(-)

Comments

Loftus, Ciara May 28, 2018, 3:43 p.m. | #1
> 
> From: Mark Kavanagh <mark.b.kavanagh@intel.com>
> 
> Currently, packets are only copied to a single segment in
> the function dpdk_do_tx_copy(). This could be an issue in
> the case of jumbo frames, particularly when multi-segment
> mbufs are involved.
> 
> This patch calculates the number of segments needed by a
> packet and copies the data to each segment.
> 
> Co-authored-by: Michael Qiu <qiudayu@chinac.com>
> 
> Signed-off-by: Mark Kavanagh <mark.b.kavanagh@intel.com>
> Signed-off-by: Michael Qiu <qiudayu@chinac.com>
> ---
>  lib/netdev-dpdk.c | 78
> ++++++++++++++++++++++++++++++++++++++++++++++++-------
>  1 file changed, 68 insertions(+), 10 deletions(-)
> 
> diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
> index d3abdde..c6dfe6d 100644
> --- a/lib/netdev-dpdk.c
> +++ b/lib/netdev-dpdk.c
> @@ -2178,6 +2178,71 @@ out:
>      }
>  }
> 
> +static int
> +dpdk_prep_tx_buf(struct dp_packet *packet, struct rte_mbuf **head,
> +                 struct rte_mempool *mp)
> +{
> +    struct rte_mbuf *temp;
> +    uint32_t size = dp_packet_size(packet);
> +    uint16_t max_data_len, data_len;
> +    uint32_t nb_segs = 0;
> +    int i;
> +
> +    temp = *head = rte_pktmbuf_alloc(mp);
> +    if (OVS_UNLIKELY(!temp)) {
> +        return 1;
> +    }
> +
> +    /* All new allocated mbuf's max data len is the same */
> +    max_data_len = temp->buf_len - temp->data_off;
> +
> +    /* Calculate # of output mbufs. */
> +    nb_segs = size / max_data_len;
> +    if (size % max_data_len) {
> +        nb_segs = nb_segs + 1;
> +    }
> +
> +    /* Allocate additional mbufs when multiple output mbufs required. */
> +    for (i = 1; i < nb_segs; i++) {
> +        temp->next = rte_pktmbuf_alloc(mp);
> +        if (!temp->next) {
> +            rte_pktmbuf_free(*head);
> +            *head = NULL;
> +            break;
> +        }
> +        temp = temp->next;
> +    }
> +    /* We have to do a copy for now */
> +    rte_pktmbuf_pkt_len(*head) = size;
> +    temp = *head;
> +
> +    data_len = size < max_data_len ? size: max_data_len;
> +    if (packet->source == DPBUF_DPDK) {
> +        *head = &(packet->mbuf);
> +        while (temp && head && size > 0) {
> +            rte_memcpy(rte_pktmbuf_mtod(temp, void *),
> +                    dp_packet_data((struct dp_packet *)head), data_len);
> +            rte_pktmbuf_data_len(temp) = data_len;
> +            *head = (*head)->next;
> +            size = size - data_len;
> +            data_len =  size < max_data_len ? size: max_data_len;
> +            temp = temp->next;
> +        }
> +    } else {
> +        int offset = 0;
> +        while (temp && size > 0) {
> +            memcpy(rte_pktmbuf_mtod(temp, void *),
> +                    dp_packet_at(packet, offset, data_len), data_len);
> +            rte_pktmbuf_data_len(temp) = data_len;
> +            temp = temp->next;
> +            size = size - data_len;
> +            offset += data_len;
> +            data_len = size < max_data_len ? size: max_data_len;
> +        }
> +    }
> +    return 0;
> +}
> +
>  /* Tx function. Transmit packets indefinitely */
>  static void
>  dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet_batch
> *batch)
> @@ -2194,6 +2259,7 @@ dpdk_do_tx_copy(struct netdev *netdev, int qid,
> struct dp_packet_batch *batch)
>      struct rte_mbuf *pkts[PKT_ARRAY_SIZE];
>      uint32_t cnt = batch_cnt;
>      uint32_t dropped = 0;
> +    uint32_t i;
> 
>      if (dev->type != DPDK_DEV_VHOST) {
>          /* Check if QoS has been configured for this netdev. */
> @@ -2204,27 +2270,19 @@ dpdk_do_tx_copy(struct netdev *netdev, int
> qid, struct dp_packet_batch *batch)
> 
>      uint32_t txcnt = 0;
> 
> -    for (uint32_t i = 0; i < cnt; i++) {
> +    for (i = 0; i < cnt; i++) {
>          struct dp_packet *packet = batch->packets[i];
>          uint32_t size = dp_packet_size(packet);
> -
>          if (OVS_UNLIKELY(size > dev->max_packet_len)) {
>              VLOG_WARN_RL(&rl, "Too big size %u max_packet_len %d",
>                           size, dev->max_packet_len);
> -
>              dropped++;
>              continue;
>          }
> -
> -        pkts[txcnt] = rte_pktmbuf_alloc(dev->mp);
> -        if (OVS_UNLIKELY(!pkts[txcnt])) {
> +        if (!dpdk_prep_tx_buf(packet, &pkts[txcnt], dev->mp)) {

We should enter here if the return is not zero.

>              dropped += cnt - i;
>              break;
>          }
> -
> -        /* We have to do a copy for now */
> -        memcpy(rte_pktmbuf_mtod(pkts[txcnt], void *),
> -               dp_packet_data(packet), size);
>          dp_packet_set_size((struct dp_packet *)pkts[txcnt], size);
>          dp_packet_copy_mbuf_flags((struct dp_packet *)pkts[txcnt], packet);
> 
> --
> 2.7.4
> 
> _______________________________________________
> dev mailing list
> dev@openvswitch.org
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
Ilya Maximets June 5, 2018, 12:03 p.m. | #2
On 23.05.2018 19:47, Tiago Lam wrote:
> From: Mark Kavanagh <mark.b.kavanagh@intel.com>
> 
> Currently, packets are only copied to a single segment in
> the function dpdk_do_tx_copy(). This could be an issue in
> the case of jumbo frames, particularly when multi-segment
> mbufs are involved.
> 
> This patch calculates the number of segments needed by a
> packet and copies the data to each segment.
> 
> Co-authored-by: Michael Qiu <qiudayu@chinac.com>
> 
> Signed-off-by: Mark Kavanagh <mark.b.kavanagh@intel.com>
> Signed-off-by: Michael Qiu <qiudayu@chinac.com>
> ---
>  lib/netdev-dpdk.c | 78 ++++++++++++++++++++++++++++++++++++++++++++++++-------
>  1 file changed, 68 insertions(+), 10 deletions(-)
> 
> diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
> index d3abdde..c6dfe6d 100644
> --- a/lib/netdev-dpdk.c
> +++ b/lib/netdev-dpdk.c
> @@ -2178,6 +2178,71 @@ out:
>      }
>  }
>  
> +static int
> +dpdk_prep_tx_buf(struct dp_packet *packet, struct rte_mbuf **head,
> +                 struct rte_mempool *mp)
> +{
> +    struct rte_mbuf *temp;
> +    uint32_t size = dp_packet_size(packet);
> +    uint16_t max_data_len, data_len;
> +    uint32_t nb_segs = 0;
> +    int i;
> +
> +    temp = *head = rte_pktmbuf_alloc(mp);

All the functions around OVS code *must* use 'dpdk_buf_alloc()' instead
of 'rte_pktmbuf_alloc()'.  Otherwise, the introduced 'nonpmd_mp_mutex'
is useless.

Also, all the calls to 'rte_pktmbuf_free()' must be replaced with
'free_dpdk_buf()' for the same reason.
And it's better to do this right in the patch 08/13 and maintain in
the subsequent patches.

> +    if (OVS_UNLIKELY(!temp)) {
> +        return 1;
> +    }
> +
> +    /* All new allocated mbuf's max data len is the same */
> +    max_data_len = temp->buf_len - temp->data_off;
> +
> +    /* Calculate # of output mbufs. */
> +    nb_segs = size / max_data_len;
> +    if (size % max_data_len) {
> +        nb_segs = nb_segs + 1;
> +    }
> +
> +    /* Allocate additional mbufs when multiple output mbufs required. */
> +    for (i = 1; i < nb_segs; i++) {
> +        temp->next = rte_pktmbuf_alloc(mp);
> +        if (!temp->next) {
> +            rte_pktmbuf_free(*head);
> +            *head = NULL;
> +            break;
> +        }
> +        temp = temp->next;
> +    }
> +    /* We have to do a copy for now */
> +    rte_pktmbuf_pkt_len(*head) = size;
> +    temp = *head;
> +
> +    data_len = size < max_data_len ? size: max_data_len;
> +    if (packet->source == DPBUF_DPDK) {
> +        *head = &(packet->mbuf);
> +        while (temp && head && size > 0) {
> +            rte_memcpy(rte_pktmbuf_mtod(temp, void *),
> +                    dp_packet_data((struct dp_packet *)head), data_len);
> +            rte_pktmbuf_data_len(temp) = data_len;
> +            *head = (*head)->next;
> +            size = size - data_len;
> +            data_len =  size < max_data_len ? size: max_data_len;
> +            temp = temp->next;
> +        }
> +    } else {
> +        int offset = 0;
> +        while (temp && size > 0) {
> +            memcpy(rte_pktmbuf_mtod(temp, void *),
> +                    dp_packet_at(packet, offset, data_len), data_len);
> +            rte_pktmbuf_data_len(temp) = data_len;
> +            temp = temp->next;
> +            size = size - data_len;
> +            offset += data_len;
> +            data_len = size < max_data_len ? size: max_data_len;
> +        }
> +    }
> +    return 0;
> +}
> +
>  /* Tx function. Transmit packets indefinitely */
>  static void
>  dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet_batch *batch)
> @@ -2194,6 +2259,7 @@ dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet_batch *batch)
>      struct rte_mbuf *pkts[PKT_ARRAY_SIZE];
>      uint32_t cnt = batch_cnt;
>      uint32_t dropped = 0;
> +    uint32_t i;
>  
>      if (dev->type != DPDK_DEV_VHOST) {
>          /* Check if QoS has been configured for this netdev. */
> @@ -2204,27 +2270,19 @@ dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet_batch *batch)
>  
>      uint32_t txcnt = 0;
>  
> -    for (uint32_t i = 0; i < cnt; i++) {
> +    for (i = 0; i < cnt; i++) {
>          struct dp_packet *packet = batch->packets[i];
>          uint32_t size = dp_packet_size(packet);
> -
>          if (OVS_UNLIKELY(size > dev->max_packet_len)) {
>              VLOG_WARN_RL(&rl, "Too big size %u max_packet_len %d",
>                           size, dev->max_packet_len);
> -
>              dropped++;
>              continue;
>          }
> -
> -        pkts[txcnt] = rte_pktmbuf_alloc(dev->mp);
> -        if (OVS_UNLIKELY(!pkts[txcnt])) {
> +        if (!dpdk_prep_tx_buf(packet, &pkts[txcnt], dev->mp)) {
>              dropped += cnt - i;
>              break;
>          }
> -
> -        /* We have to do a copy for now */
> -        memcpy(rte_pktmbuf_mtod(pkts[txcnt], void *),
> -               dp_packet_data(packet), size);
>          dp_packet_set_size((struct dp_packet *)pkts[txcnt], size);
>          dp_packet_copy_mbuf_flags((struct dp_packet *)pkts[txcnt], packet);
>  
>
Tiago Lam June 8, 2018, 9:10 a.m. | #3
On 05/06/2018 13:03, Ilya Maximets wrote:
> On 23.05.2018 19:47, Tiago Lam wrote:
>> From: Mark Kavanagh <mark.b.kavanagh@intel.com>
>>
>> Currently, packets are only copied to a single segment in
>> the function dpdk_do_tx_copy(). This could be an issue in
>> the case of jumbo frames, particularly when multi-segment
>> mbufs are involved.
>>
>> This patch calculates the number of segments needed by a
>> packet and copies the data to each segment.
>>
>> Co-authored-by: Michael Qiu <qiudayu@chinac.com>
>>
>> Signed-off-by: Mark Kavanagh <mark.b.kavanagh@intel.com>
>> Signed-off-by: Michael Qiu <qiudayu@chinac.com>
>> ---
>>  lib/netdev-dpdk.c | 78 ++++++++++++++++++++++++++++++++++++++++++++++++-------
>>  1 file changed, 68 insertions(+), 10 deletions(-)
>>
>> diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
>> index d3abdde..c6dfe6d 100644
>> --- a/lib/netdev-dpdk.c
>> +++ b/lib/netdev-dpdk.c
>> @@ -2178,6 +2178,71 @@ out:
>>      }
>>  }
>>  
>> +static int
>> +dpdk_prep_tx_buf(struct dp_packet *packet, struct rte_mbuf **head,
>> +                 struct rte_mempool *mp)
>> +{
>> +    struct rte_mbuf *temp;
>> +    uint32_t size = dp_packet_size(packet);
>> +    uint16_t max_data_len, data_len;
>> +    uint32_t nb_segs = 0;
>> +    int i;
>> +
>> +    temp = *head = rte_pktmbuf_alloc(mp);
> 
> All the functions around OVS code *must* use 'dpdk_buf_alloc()' instead
> of 'rte_pktmbuf_alloc()'.  Otherwise, the introduced 'nonpmd_mp_mutex'
> is useless.
> 
> Also, all the calls to 'rte_pktmbuf_free()' must be replaced with
> 'free_dpdk_buf()' for the same reason.
> And it's better to do this right in the patch 08/13 and maintain in
> the subsequent patches.
> 

Hi Ilya,

Thanks for pointing that out. I did miss the call to `free_dpdk_buf()`
in `dp_packet_set_size()` in patch 04/13. However, from what I can see
allocating mbufs directly from 'rte_pktmbuf_free()' was already
happening before this patch, in `dpdk_do_tx_copy()`. This patch just
happens to allocate more mbufs if multi-segment mbufs are being used.

Now, this could have been introduced in the past and been overlooked,
but it seems `dpdk_do_tx_copy()` is only called from
`netdev_dpdk_vhost_send()` and `netdev_dpdk_send__()`, and that seems to
be called with the `non_pmd_mutex` mutex held already (in
dpif_netdev.c). Am I missing other call path here? Otherwise it seems to
be safe.

I'm just confirming that this is the case though, as your suggestions do
make sense to keep the code uniform, since we now have the wrappers
around the `nonpm_mp_mutex` mutex (and because it shouldn't increase
contention, since the `dp->non_pmd_mutex` should already be held for
non-pmds for this call path). I'll include it for the next iteration.

Regards,
Tiago.

Patch

diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index d3abdde..c6dfe6d 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -2178,6 +2178,71 @@  out:
     }
 }
 
+static int
+dpdk_prep_tx_buf(struct dp_packet *packet, struct rte_mbuf **head,
+                 struct rte_mempool *mp)
+{
+    struct rte_mbuf *temp;
+    uint32_t size = dp_packet_size(packet);
+    uint16_t max_data_len, data_len;
+    uint32_t nb_segs = 0;
+    int i;
+
+    temp = *head = rte_pktmbuf_alloc(mp);
+    if (OVS_UNLIKELY(!temp)) {
+        return 1;
+    }
+
+    /* All new allocated mbuf's max data len is the same */
+    max_data_len = temp->buf_len - temp->data_off;
+
+    /* Calculate # of output mbufs. */
+    nb_segs = size / max_data_len;
+    if (size % max_data_len) {
+        nb_segs = nb_segs + 1;
+    }
+
+    /* Allocate additional mbufs when multiple output mbufs required. */
+    for (i = 1; i < nb_segs; i++) {
+        temp->next = rte_pktmbuf_alloc(mp);
+        if (!temp->next) {
+            rte_pktmbuf_free(*head);
+            *head = NULL;
+            break;
+        }
+        temp = temp->next;
+    }
+    /* We have to do a copy for now */
+    rte_pktmbuf_pkt_len(*head) = size;
+    temp = *head;
+
+    data_len = size < max_data_len ? size: max_data_len;
+    if (packet->source == DPBUF_DPDK) {
+        *head = &(packet->mbuf);
+        while (temp && head && size > 0) {
+            rte_memcpy(rte_pktmbuf_mtod(temp, void *),
+                    dp_packet_data((struct dp_packet *)head), data_len);
+            rte_pktmbuf_data_len(temp) = data_len;
+            *head = (*head)->next;
+            size = size - data_len;
+            data_len =  size < max_data_len ? size: max_data_len;
+            temp = temp->next;
+        }
+    } else {
+        int offset = 0;
+        while (temp && size > 0) {
+            memcpy(rte_pktmbuf_mtod(temp, void *),
+                    dp_packet_at(packet, offset, data_len), data_len);
+            rte_pktmbuf_data_len(temp) = data_len;
+            temp = temp->next;
+            size = size - data_len;
+            offset += data_len;
+            data_len = size < max_data_len ? size: max_data_len;
+        }
+    }
+    return 0;
+}
+
 /* Tx function. Transmit packets indefinitely */
 static void
 dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet_batch *batch)
@@ -2194,6 +2259,7 @@  dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet_batch *batch)
     struct rte_mbuf *pkts[PKT_ARRAY_SIZE];
     uint32_t cnt = batch_cnt;
     uint32_t dropped = 0;
+    uint32_t i;
 
     if (dev->type != DPDK_DEV_VHOST) {
         /* Check if QoS has been configured for this netdev. */
@@ -2204,27 +2270,19 @@  dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet_batch *batch)
 
     uint32_t txcnt = 0;
 
-    for (uint32_t i = 0; i < cnt; i++) {
+    for (i = 0; i < cnt; i++) {
         struct dp_packet *packet = batch->packets[i];
         uint32_t size = dp_packet_size(packet);
-
         if (OVS_UNLIKELY(size > dev->max_packet_len)) {
             VLOG_WARN_RL(&rl, "Too big size %u max_packet_len %d",
                          size, dev->max_packet_len);
-
             dropped++;
             continue;
         }
-
-        pkts[txcnt] = rte_pktmbuf_alloc(dev->mp);
-        if (OVS_UNLIKELY(!pkts[txcnt])) {
+        if (!dpdk_prep_tx_buf(packet, &pkts[txcnt], dev->mp)) {
             dropped += cnt - i;
             break;
         }
-
-        /* We have to do a copy for now */
-        memcpy(rte_pktmbuf_mtod(pkts[txcnt], void *),
-               dp_packet_data(packet), size);
         dp_packet_set_size((struct dp_packet *)pkts[txcnt], size);
         dp_packet_copy_mbuf_flags((struct dp_packet *)pkts[txcnt], packet);