
[ovs-dev,RFC,v2,1/1] netdev-dpdk: enable multi-segment jumbo frames

Message ID 1494843429-20385-2-git-send-email-mark.b.kavanagh@intel.com
State Deferred
Headers show

Commit Message

Mark Kavanagh May 15, 2017, 10:17 a.m. UTC
Currently, jumbo frame support for OvS-DPDK is implemented by increasing
the size of mbufs within a mempool, such that each mbuf within the pool
is large enough to contain an entire jumbo frame of a user-defined size.
Typically, for each user-defined MTU 'requested_mtu', a new mempool is
created, containing mbufs of size ~requested_mtu.

With the multi-segment approach, all ports share the same mempool, in
which each mbuf is of standard/default size (~2 KB). To accommodate
jumbo frames, mbufs may be chained together, each mbuf storing a portion
of the jumbo frame; each mbuf in the chain is termed a segment, hence
the name.
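
As an illustration, a receiver can recover the full frame by walking the chain
via each mbuf's 'next' pointer. A minimal sketch against DPDK's rte_mbuf API
(the helper is hypothetical, not part of this patch):

    #include <rte_mbuf.h>

    /* Sum the payload stored across all segments of a chained mbuf.
     * For a well-formed chain, this equals head->pkt_len. */
    static uint32_t
    jumbo_frame_len(const struct rte_mbuf *head)
    {
        uint32_t len = 0;
        const struct rte_mbuf *seg;

        for (seg = head; seg != NULL; seg = seg->next) {
            len += seg->data_len;   /* portion of the frame in this segment */
        }
        return len;
    }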

Signed-off-by: Mark Kavanagh <mark.b.kavanagh@intel.com>

---
 
 V2->V1: add missing 'signed-off-by' tag; no code changes

 lib/dpdk.c           |  6 ++++++
 lib/netdev-dpdk.c    | 54 ++++++++++++++++++++++++++++++++++++++++++++++------
 lib/netdev-dpdk.h    |  1 +
 vswitchd/vswitch.xml | 19 ++++++++++++++++++
 4 files changed, 74 insertions(+), 6 deletions(-)
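
The new knob is read once in dpdk_init__(), so it takes effect when
ovs-vswitchd (re)starts. A usage sketch, using standard ovs-vsctl syntax with
the key name introduced by this patch:

    # Enable multi-segment mbufs; restart ovs-vswitchd for it to take effect.
    ovs-vsctl set Open_vSwitch . other_config:dpdk-multi-seg-mbufs=true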

Comments

仇大玉 May 17, 2017, 2:03 a.m. UTC | #1
On 2017/5/15 18:17, Mark Kavanagh wrote:
> Currently, jumbo frame support for OvS-DPDK is implemented by increasing
> the size of mbufs within a mempool, such that each mbuf within the pool
> is large enough to contain an entire jumbo frame of a user-defined size.
> Typically, for each user-defined MTU 'requested_mtu', a new mempool is
> created, containing mbufs of size ~requested_mtu.
>
> With the multi-segment approach, all ports share the same mempool, in
> which each mbuf is of standard/default size (~2 KB). To accommodate
> jumbo frames, mbufs may be chained together, each mbuf storing a portion
> of the jumbo frame; each mbuf in the chain is termed a segment, hence
> the name.
>
> Signed-off-by: Mark Kavanagh <mark.b.kavanagh@intel.com>
>
> ---
>   
>   V2->V1: add missing 'signed-off-by' tag; no code changes
>
>   lib/dpdk.c           |  6 ++++++
>   lib/netdev-dpdk.c    | 54 ++++++++++++++++++++++++++++++++++++++++++++++------
>   lib/netdev-dpdk.h    |  1 +
>   vswitchd/vswitch.xml | 19 ++++++++++++++++++
>   4 files changed, 74 insertions(+), 6 deletions(-)
>
> diff --git a/lib/dpdk.c b/lib/dpdk.c
> index 8da6c32..7d08e34 100644
> --- a/lib/dpdk.c
> +++ b/lib/dpdk.c
> @@ -450,6 +450,12 @@ dpdk_init__(const struct smap *ovs_other_config)
>   
>       /* Finally, register the dpdk classes */
>       netdev_dpdk_register();
> +
> +    bool multi_seg_mbufs_enable = smap_get_bool(ovs_other_config, "dpdk-multi-seg-mbufs", false);
> +    if (multi_seg_mbufs_enable) {
> +        VLOG_INFO("DPDK multi-segment mbufs enabled\n");
> +        netdev_dpdk_multi_segment_mbufs_enable();
> +    }
>   }
>   
>   void
> diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
> index 34fc54b..82bc0e2 100644
> --- a/lib/netdev-dpdk.c
> +++ b/lib/netdev-dpdk.c
> @@ -58,6 +58,7 @@
>   
>   VLOG_DEFINE_THIS_MODULE(netdev_dpdk);
>   static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
> +static bool dpdk_multi_segment_mbufs = false;
>   
>   #define DPDK_PORT_WATCHDOG_INTERVAL 5
>   
> @@ -480,7 +481,7 @@ dpdk_mp_create(int socket_id, int mtu)
>        * when the number of ports and rxqs that utilize a particular mempool can
>       * change dynamically at runtime. For now, use this rough heuristic.
>        */
> -    if (mtu >= ETHER_MTU) {
> +    if (mtu >= ETHER_MTU || dpdk_multi_segment_mbufs) {
>           mp_size = MAX_NB_MBUF;
>       } else {
>           mp_size = MIN_NB_MBUF;
> @@ -558,17 +559,33 @@ dpdk_mp_put(struct dpdk_mp *dmp)
>       ovs_mutex_unlock(&dpdk_mp_mutex);
>   }
>   
> -/* Tries to allocate new mempool on requested_socket_id with
> - * mbuf size corresponding to requested_mtu.
> +/* Tries to configure a mempool for 'dev' on requested socket_id to accommodate
> + * packets of size 'requested_mtu'. The properties of the mempool's elements
> + * are dependent on the value of 'dpdk_multi_segment_mbufs':
> + * - if 'true', then the mempool contains standard-sized mbufs that are chained
> + *   together to accommodate packets of size 'requested_mtu'. All ports on the
> + *   same socket will share this mempool, irrespective of their MTU.
> + * - if 'false', then a mempool is allocated, the members of which are non-
> + *   standard-sized mbufs. Each mbuf in the mempool is large enough to fully
> + *   accommodate packets of size 'requested_mtu'.
> + *
>    * On success new configuration will be applied.
>    * On error, device will be left unchanged. */
>   static int
>   netdev_dpdk_mempool_configure(struct netdev_dpdk *dev)
>       OVS_REQUIRES(dev->mutex)
>   {
> -    uint32_t buf_size = dpdk_buf_size(dev->requested_mtu);
> +    uint32_t buf_size = 0;
>       struct dpdk_mp *mp;
>   
> +    /* Contiguous mbufs in use - permit oversized mbufs */
> +    if (!dpdk_multi_segment_mbufs) {
> +        buf_size = dpdk_buf_size(dev->requested_mtu);
> +    } else {
> +        /* multi-segment mbufs - use standard mbuf size */
> +        buf_size = dpdk_buf_size(ETHER_MTU);
> +    }
> +
>       mp = dpdk_mp_get(dev->requested_socket_id, FRAME_LEN_TO_MTU(buf_size));
>       if (!mp) {
>           VLOG_ERR("Failed to create memory pool for netdev "
> @@ -577,7 +594,13 @@ netdev_dpdk_mempool_configure(struct netdev_dpdk *dev)
>                    rte_strerror(rte_errno));
>           return rte_errno;
>       } else {
> -        dpdk_mp_put(dev->dpdk_mp);
> +        /* When single-segment mbufs are in use, a new mempool is allocated,
> +         * so release the old one. In the case of multi-segment mbufs, the
> +         * same mempool is used for all MTUs.
> +         */
> +        if (!dpdk_multi_segment_mbufs) {
> +            dpdk_mp_put(dev->dpdk_mp);
> +        }
>           dev->dpdk_mp = mp;
>           dev->mtu = dev->requested_mtu;
>           dev->socket_id = dev->requested_socket_id;
> @@ -639,6 +662,7 @@ dpdk_eth_dev_queue_setup(struct netdev_dpdk *dev, int n_rxq, int n_txq)
>       int diag = 0;
>       int i;
>       struct rte_eth_conf conf = port_conf;
> +    struct rte_eth_txconf txconf;
>   
>       if (dev->mtu > ETHER_MTU) {
>           conf.rxmode.jumbo_frame = 1;
> @@ -666,9 +690,21 @@ dpdk_eth_dev_queue_setup(struct netdev_dpdk *dev, int n_rxq, int n_txq)
>               break;
>           }
>   
> +        /* DPDK PMDs typically attempt to use simple or vectorized
> +         * transmit functions, neither of which are compatible with
> +         * multi-segment mbufs. Ensure that these are disabled in the
> +         * multi-segment mbuf case.
> +         */
> +        if (dpdk_multi_segment_mbufs) {
> +            memset(&txconf, 0, sizeof(txconf));
> +            txconf.txq_flags &= ~ETH_TXQ_FLAGS_NOMULTSEGS;
> +        }
> +
>           for (i = 0; i < n_txq; i++) {
>               diag = rte_eth_tx_queue_setup(dev->port_id, i, dev->txq_size,
> -                                          dev->socket_id, NULL);
> +                                          dev->socket_id,
> +                                          dpdk_multi_segment_mbufs ? &txconf
> +                                                                   : NULL);
>               if (diag) {
>                   VLOG_INFO("Interface %s txq(%d) setup error: %s",
>                             dev->up.name, i, rte_strerror(-diag));
> @@ -3287,6 +3323,12 @@ unlock:
>       return err;
>   }
>   
> +void
> +netdev_dpdk_multi_segment_mbufs_enable(void)
> +{
> +    dpdk_multi_segment_mbufs = true;
> +}
> +
>   #define NETDEV_DPDK_CLASS(NAME, INIT, CONSTRUCT, DESTRUCT,    \
>                             SET_CONFIG, SET_TX_MULTIQ, SEND,    \
>                             GET_CARRIER, GET_STATS,             \
> diff --git a/lib/netdev-dpdk.h b/lib/netdev-dpdk.h
> index b7d02a7..5b14c96 100644
> --- a/lib/netdev-dpdk.h
> +++ b/lib/netdev-dpdk.h
> @@ -26,6 +26,7 @@ struct dp_packet;
>   #ifdef DPDK_NETDEV
>   
>   void netdev_dpdk_register(void);
> +void netdev_dpdk_multi_segment_mbufs_enable(void);
>   void free_dpdk_buf(struct dp_packet *);
>   
>   #else
> diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml
> index bb66cb5..fd8551e 100644
> --- a/vswitchd/vswitch.xml
> +++ b/vswitchd/vswitch.xml
> @@ -283,6 +283,25 @@
>           </p>
>         </column>
>   
> +      <column name="other_config" key="dpdk-multi-seg-mbufs"
> +              type='{"type": "boolean"}'>
> +        <p>
> +          Specifies if DPDK uses multi-segment mbufs for handling jumbo frames.
> +        </p>
> +        <p>
> +            If true, DPDK allocates a single mempool for all ports, irrespective
> +            of the ports' requested MTU sizes. The elements of this mempool are
> +            'standard'-sized mbufs (typically ~2 KB), which may be chained
> +            together to accommodate jumbo frames. In this approach, each mbuf
> +            typically stores a fragment of the overall jumbo frame.
> +        </p>
> +        <p>
> +            If not specified, defaults to <code>false</code>, in which case the size
> +            of each mbuf within a DPDK port's mempool will be grown to accommodate
> +            jumbo frames within a single mbuf.
> +        </p>
> +      </column>
> +
>         <column name="other_config" key="dpdk-extra"
>                 type='{"type": "string"}'>
>           <p>

LGTM,

Acked-by: Michael Qiu <qiudayu@chinac.com>
Chandran, Sugesh May 26, 2017, 7:07 p.m. UTC | #2
Regards
_Sugesh


> -----Original Message-----
> From: ovs-dev-bounces@openvswitch.org [mailto:ovs-dev-
> bounces@openvswitch.org] On Behalf Of Mark Kavanagh
> Sent: Monday, May 15, 2017 11:17 AM
> To: ovs-dev@openvswitch.org; qiudayu@chinac.com
> Subject: [ovs-dev] [RFC PATCH v2 1/1] netdev-dpdk: enable multi-segment
> jumbo frames
> 
> Currently, jumbo frame support for OvS-DPDK is implemented by increasing
> the size of mbufs within a mempool, such that each mbuf within the pool is
> large enough to contain an entire jumbo frame of a user-defined size.
> Typically, for each user-defined MTU 'requested_mtu', a new mempool is
> created, containing mbufs of size ~requested_mtu.
> 
> With the multi-segment approach, all ports share the same mempool, in
> which each mbuf is of standard/default size (~2 KB). To accommodate
> jumbo frames, mbufs may be chained together, each mbuf storing a portion
> of the jumbo frame; each mbuf in the chain is termed a segment, hence the
> name.
> 
> Signed-off-by: Mark Kavanagh <mark.b.kavanagh@intel.com>
> 
> ---
> 
>  V2->V1: add missing 'signed-off-by' tag; no code changes
> 
>  lib/dpdk.c           |  6 ++++++
>  lib/netdev-dpdk.c    | 54 ++++++++++++++++++++++++++++++++++++++++++++++------
>  lib/netdev-dpdk.h    |  1 +
>  vswitchd/vswitch.xml | 19 ++++++++++++++++++
>  4 files changed, 74 insertions(+), 6 deletions(-)
> 
> diff --git a/lib/dpdk.c b/lib/dpdk.c
> index 8da6c32..7d08e34 100644
> --- a/lib/dpdk.c
> +++ b/lib/dpdk.c
> @@ -450,6 +450,12 @@ dpdk_init__(const struct smap *ovs_other_config)
> 
>      /* Finally, register the dpdk classes */
>      netdev_dpdk_register();
> +
> +    bool multi_seg_mbufs_enable = smap_get_bool(ovs_other_config, "dpdk-multi-seg-mbufs", false);
> +    if (multi_seg_mbufs_enable) {
> +        VLOG_INFO("DPDK multi-segment mbufs enabled\n");
> +        netdev_dpdk_multi_segment_mbufs_enable();
> +    }
>  }
> 
>  void
> diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
> index 34fc54b..82bc0e2 100644
> --- a/lib/netdev-dpdk.c
> +++ b/lib/netdev-dpdk.c
> @@ -58,6 +58,7 @@
> 
>  VLOG_DEFINE_THIS_MODULE(netdev_dpdk);
>  static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
> +static bool dpdk_multi_segment_mbufs = false;
> 
>  #define DPDK_PORT_WATCHDOG_INTERVAL 5
> 
> @@ -480,7 +481,7 @@ dpdk_mp_create(int socket_id, int mtu)
>       * when the number of ports and rxqs that utilize a particular mempool can
>       * change dynamically at runtime. For now, use this rough heuristic.
>       */
> -    if (mtu >= ETHER_MTU) {
> +    if (mtu >= ETHER_MTU || dpdk_multi_segment_mbufs) {
>          mp_size = MAX_NB_MBUF;
>      } else {
>          mp_size = MIN_NB_MBUF;
> @@ -558,17 +559,33 @@ dpdk_mp_put(struct dpdk_mp *dmp)
>      ovs_mutex_unlock(&dpdk_mp_mutex);
>  }
> 
> -/* Tries to allocate new mempool on requested_socket_id with
> - * mbuf size corresponding to requested_mtu.
> +/* Tries to configure a mempool for 'dev' on requested socket_id to accommodate
> + * packets of size 'requested_mtu'. The properties of the mempool's elements
> + * are dependent on the value of 'dpdk_multi_segment_mbufs':
> + * - if 'true', then the mempool contains standard-sized mbufs that are chained
> + *   together to accommodate packets of size 'requested_mtu'. All ports on the
> + *   same socket will share this mempool, irrespective of their MTU.
> + * - if 'false', then a mempool is allocated, the members of which are non-
> + *   standard-sized mbufs. Each mbuf in the mempool is large enough to fully
> + *   accommodate packets of size 'requested_mtu'.
> + *
>   * On success new configuration will be applied.
>   * On error, device will be left unchanged. */
>  static int
>  netdev_dpdk_mempool_configure(struct netdev_dpdk *dev)
>      OVS_REQUIRES(dev->mutex)
>  {
> -    uint32_t buf_size = dpdk_buf_size(dev->requested_mtu);
> +    uint32_t buf_size = 0;
>      struct dpdk_mp *mp;
> 
> +    /* Contiguous mbufs in use - permit oversized mbufs */
> +    if (!dpdk_multi_segment_mbufs) {
[Sugesh] OK, so you are supporting mbuf allocation in both models for jumbo frames.
Is chained mbuf the only way to implement jumbo frames?
It looks to me like there are some limitations on single-mbuf jumbo frames as well.


> +        buf_size = dpdk_buf_size(dev->requested_mtu);
> +    } else {
> +        /* multi-segment mbufs - use standard mbuf size */
> +        buf_size = dpdk_buf_size(ETHER_MTU);
> +    }
> +
>      mp = dpdk_mp_get(dev->requested_socket_id, FRAME_LEN_TO_MTU(buf_size));
>      if (!mp) {
>          VLOG_ERR("Failed to create memory pool for netdev "
> @@ -577,7 +594,13 @@ netdev_dpdk_mempool_configure(struct netdev_dpdk *dev)
>                   rte_strerror(rte_errno));
>          return rte_errno;
>      } else {
> -        dpdk_mp_put(dev->dpdk_mp);
> +        /* When single-segment mbufs are in use, a new mempool is allocated,
> +         * so release the old one. In the case of multi-segment mbufs, the
> +         * same mempool is used for all MTUs.
> +         */
> +        if (!dpdk_multi_segment_mbufs) {
> +            dpdk_mp_put(dev->dpdk_mp);
> +        }
>          dev->dpdk_mp = mp;
>          dev->mtu = dev->requested_mtu;
>          dev->socket_id = dev->requested_socket_id;
> @@ -639,6 +662,7 @@ dpdk_eth_dev_queue_setup(struct netdev_dpdk *dev, int n_rxq, int n_txq)
>      int diag = 0;
>      int i;
>      struct rte_eth_conf conf = port_conf;
> +    struct rte_eth_txconf txconf;
> 
>      if (dev->mtu > ETHER_MTU) {
>          conf.rxmode.jumbo_frame = 1;
> @@ -666,9 +690,21 @@ dpdk_eth_dev_queue_setup(struct netdev_dpdk *dev, int n_rxq, int n_txq)
>              break;
>          }
> 
> +        /* DPDK PMDs typically attempt to use simple or vectorized
> +         * transmit functions, neither of which are compatible with
> +         * multi-segment mbufs. Ensure that these are disabled in the
> +         * multi-segment mbuf case.
> +         */
> +        if (dpdk_multi_segment_mbufs) {
> +            memset(&txconf, 0, sizeof(txconf));
> +            txconf.txq_flags &= ~ETH_TXQ_FLAGS_NOMULTSEGS;
[Sugesh] Technically, you only need to disable these if the port's MTU is bigger than
the mbuf size. If the MTU is smaller, use normal mbufs.
Would it make sense to have this chained-mbuf support per port rather than global?


> +        }
> +
>          for (i = 0; i < n_txq; i++) {
>              diag = rte_eth_tx_queue_setup(dev->port_id, i, dev->txq_size,
> -                                          dev->socket_id, NULL);
> +                                          dev->socket_id,
> +                                          dpdk_multi_segment_mbufs ? &txconf
> +                                                                   : NULL);
>              if (diag) {
>                  VLOG_INFO("Interface %s txq(%d) setup error: %s",
>                            dev->up.name, i, rte_strerror(-diag)); @@ -3287,6 +3323,12
>                            dev->up.name, i, rte_strerror(-diag));
> @@ -3287,6 +3323,12 @@ unlock:
>  }
> 
> +void
> +netdev_dpdk_multi_segment_mbufs_enable(void)
> +{
> +    dpdk_multi_segment_mbufs = true;
> +}
> +
>  #define NETDEV_DPDK_CLASS(NAME, INIT, CONSTRUCT, DESTRUCT,    \
>                            SET_CONFIG, SET_TX_MULTIQ, SEND,    \
>                            GET_CARRIER, GET_STATS,             \
> diff --git a/lib/netdev-dpdk.h b/lib/netdev-dpdk.h
> index b7d02a7..5b14c96 100644
> --- a/lib/netdev-dpdk.h
> +++ b/lib/netdev-dpdk.h
> @@ -26,6 +26,7 @@ struct dp_packet;
>  #ifdef DPDK_NETDEV
> 
>  void netdev_dpdk_register(void);
> +void netdev_dpdk_multi_segment_mbufs_enable(void);
>  void free_dpdk_buf(struct dp_packet *);
> 
>  #else
> diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml
> index bb66cb5..fd8551e 100644
> --- a/vswitchd/vswitch.xml
> +++ b/vswitchd/vswitch.xml
> @@ -283,6 +283,25 @@
>          </p>
>        </column>
> 
> +      <column name="other_config" key="dpdk-multi-seg-mbufs"
> +              type='{"type": "boolean"}'>
> +        <p>
> +          Specifies if DPDK uses multi-segment mbufs for handling jumbo frames.
> +        </p>
> +        <p>
> +            If true, DPDK allocates a single mempool for all ports, irrespective
> +            of the ports' requested MTU sizes. The elements of this mempool are
> +            'standard'-sized mbufs (typically ~2 KB), which may be chained
> +            together to accommodate jumbo frames. In this approach, each mbuf
> +            typically stores a fragment of the overall jumbo frame.
> +        </p>
> +        <p>
> +            If not specified, defaults to <code>false</code>, in which case the size
> +            of each mbuf within a DPDK port's mempool will be grown to accommodate
> +            jumbo frames within a single mbuf.
> +        </p>
> +      </column>
> +
>        <column name="other_config" key="dpdk-extra"
>                type='{"type": "string"}'>
>          <p>
> --
> 1.9.3
> 
> _______________________________________________
> dev mailing list
> dev@openvswitch.org
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
Mark Kavanagh May 31, 2017, 1:45 p.m. UTC | #3
>From: Chandran, Sugesh
>Sent: Friday, May 26, 2017 8:07 PM
>To: Kavanagh, Mark B <mark.b.kavanagh@intel.com>; ovs-dev@openvswitch.org; qiudayu@chinac.com
>Subject: RE: [ovs-dev] [RFC PATCH v2 1/1] netdev-dpdk: enable multi-segment jumbo frames
>
>
>
>Regards
>_Sugesh


Thanks Sugesh - comments inline.

Cheers,
Mark

>
>
>> -----Original Message-----
>> From: ovs-dev-bounces@openvswitch.org [mailto:ovs-dev-
>> bounces@openvswitch.org] On Behalf Of Mark Kavanagh
>> Sent: Monday, May 15, 2017 11:17 AM
>> To: ovs-dev@openvswitch.org; qiudayu@chinac.com
>> Subject: [ovs-dev] [RFC PATCH v2 1/1] netdev-dpdk: enable multi-segment
>> jumbo frames
>>
>> Currently, jumbo frame support for OvS-DPDK is implemented by increasing
>> the size of mbufs within a mempool, such that each mbuf within the pool is
>> large enough to contain an entire jumbo frame of a user-defined size.
>> Typically, for each user-defined MTU 'requested_mtu', a new mempool is
>> created, containing mbufs of size ~requested_mtu.
>>
>> With the multi-segment approach, all ports share the same mempool, in
>> which each mbuf is of standard/default size (~2 KB). To accommodate
>> jumbo frames, mbufs may be chained together, each mbuf storing a portion
>> of the jumbo frame; each mbuf in the chain is termed a segment, hence the
>> name.
>>
>> Signed-off-by: Mark Kavanagh <mark.b.kavanagh@intel.com>
>>
>> ---
>>
>>  V2->V1: add missing 'signed-off-by' tag; no code changes
>>
>>  lib/dpdk.c           |  6 ++++++
>>  lib/netdev-dpdk.c    | 54 ++++++++++++++++++++++++++++++++++++++++++++++------
>>  lib/netdev-dpdk.h    |  1 +
>>  vswitchd/vswitch.xml | 19 ++++++++++++++++++
>>  4 files changed, 74 insertions(+), 6 deletions(-)
>>
>> diff --git a/lib/dpdk.c b/lib/dpdk.c
>> index 8da6c32..7d08e34 100644
>> --- a/lib/dpdk.c
>> +++ b/lib/dpdk.c
>> @@ -450,6 +450,12 @@ dpdk_init__(const struct smap *ovs_other_config)
>>
>>      /* Finally, register the dpdk classes */
>>      netdev_dpdk_register();
>> +
>> +    bool multi_seg_mbufs_enable = smap_get_bool(ovs_other_config, "dpdk-multi-seg-mbufs", false);
>> +    if (multi_seg_mbufs_enable) {
>> +        VLOG_INFO("DPDK multi-segment mbufs enabled\n");
>> +        netdev_dpdk_multi_segment_mbufs_enable();
>> +    }
>>  }
>>
>>  void
>> diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
>> index 34fc54b..82bc0e2 100644
>> --- a/lib/netdev-dpdk.c
>> +++ b/lib/netdev-dpdk.c
>> @@ -58,6 +58,7 @@
>>
>>  VLOG_DEFINE_THIS_MODULE(netdev_dpdk);
>>  static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
>> +static bool dpdk_multi_segment_mbufs = false;
>>
>>  #define DPDK_PORT_WATCHDOG_INTERVAL 5
>>
>> @@ -480,7 +481,7 @@ dpdk_mp_create(int socket_id, int mtu)
>>       * when the number of ports and rxqs that utilize a particular mempool can
>>       * change dynamically at runtime. For now, use this rough heuristic.
>>       */
>> -    if (mtu >= ETHER_MTU) {
>> +    if (mtu >= ETHER_MTU || dpdk_multi_segment_mbufs) {
>>          mp_size = MAX_NB_MBUF;
>>      } else {
>>          mp_size = MIN_NB_MBUF;
>> @@ -558,17 +559,33 @@ dpdk_mp_put(struct dpdk_mp *dmp)
>>      ovs_mutex_unlock(&dpdk_mp_mutex);
>>  }
>>
>> -/* Tries to allocate new mempool on requested_socket_id with
>> - * mbuf size corresponding to requested_mtu.
>> +/* Tries to configure a mempool for 'dev' on requested socket_id to accommodate
>> + * packets of size 'requested_mtu'. The properties of the mempool's elements
>> + * are dependent on the value of 'dpdk_multi_segment_mbufs':
>> + * - if 'true', then the mempool contains standard-sized mbufs that are chained
>> + *   together to accommodate packets of size 'requested_mtu'. All ports on the
>> + *   same socket will share this mempool, irrespective of their MTU.
>> + * - if 'false', then a mempool is allocated, the members of which are non-
>> + *   standard-sized mbufs. Each mbuf in the mempool is large enough to fully
>> + *   accommodate packets of size 'requested_mtu'.
>> + *
>>   * On success new configuration will be applied.
>>   * On error, device will be left unchanged. */
>>  static int
>>  netdev_dpdk_mempool_configure(struct netdev_dpdk *dev)
>>      OVS_REQUIRES(dev->mutex)
>>  {
>> -    uint32_t buf_size = dpdk_buf_size(dev->requested_mtu);
>> +    uint32_t buf_size = 0;
>>      struct dpdk_mp *mp;
>>
>> +    /* Contiguous mbufs in use - permit oversized mbufs */
>> +    if (!dpdk_multi_segment_mbufs) {
>[Sugesh] OK, so you are supporting mbuf allocation in both models for jumbo frames.
>Is chained mbuf the only way to implement jumbo frames?

I'm not sure what's being asked here (sorry!) - could you rephrase please?

>It looks to me like there are some limitations on single-mbuf jumbo frames as well.

You mean on the size of single-segment jumbo frames?
If so, then yes, there are restrictions on their size (for physical ports at least), which are NIC/PMD dependent.
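
For example, the limit advertised by the PMD can be checked before growing
mbufs; a sketch assuming DPDK's rte_eth_dev_info_get(), where 'port_id' and
'frame_len' stand in for the port and frame size under consideration:

    struct rte_eth_dev_info info;

    /* max_rx_pktlen is the largest frame this NIC/PMD will accept. */
    rte_eth_dev_info_get(port_id, &info);
    if (frame_len > info.max_rx_pktlen) {
        /* A single-mbuf jumbo frame of this size is not supported. */
    }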

>
>
>> +        buf_size = dpdk_buf_size(dev->requested_mtu);
>> +    } else {
>> +        /* multi-segment mbufs - use standard mbuf size */
>> +        buf_size = dpdk_buf_size(ETHER_MTU);
>> +    }
>> +
>>      mp = dpdk_mp_get(dev->requested_socket_id, FRAME_LEN_TO_MTU(buf_size));
>>      if (!mp) {
>>          VLOG_ERR("Failed to create memory pool for netdev "
>> @@ -577,7 +594,13 @@ netdev_dpdk_mempool_configure(struct netdev_dpdk *dev)
>>                   rte_strerror(rte_errno));
>>          return rte_errno;
>>      } else {
>> -        dpdk_mp_put(dev->dpdk_mp);
>> +        /* When single-segment mbufs are in use, a new mempool is allocated,
>> +         * so release the old one. In the case of multi-segment mbufs, the
>> +         * same mempool is used for all MTUs.
>> +         */
>> +        if (!dpdk_multi_segment_mbufs) {
>> +            dpdk_mp_put(dev->dpdk_mp);
>> +        }
>>          dev->dpdk_mp = mp;
>>          dev->mtu = dev->requested_mtu;
>>          dev->socket_id = dev->requested_socket_id;
>> @@ -639,6 +662,7 @@ dpdk_eth_dev_queue_setup(struct netdev_dpdk *dev, int n_rxq, int n_txq)
>>      int diag = 0;
>>      int i;
>>      struct rte_eth_conf conf = port_conf;
>> +    struct rte_eth_txconf txconf;
>>
>>      if (dev->mtu > ETHER_MTU) {
>>          conf.rxmode.jumbo_frame = 1;
>> @@ -666,9 +690,21 @@ dpdk_eth_dev_queue_setup(struct netdev_dpdk *dev, int n_rxq, int n_txq)
>>              break;
>>          }
>>
>> +        /* DPDK PMDs typically attempt to use simple or vectorized
>> +         * transmit functions, neither of which are compatible with
>> +         * multi-segment mbufs. Ensure that these are disabled in the
>> +         * multi-segment mbuf case.
>> +         */
>> +        if (dpdk_multi_segment_mbufs) {
>> +            memset(&txconf, 0, sizeof(txconf));
>> +            txconf.txq_flags &= ~ETH_TXQ_FLAGS_NOMULTSEGS;
>[Sugesh] Technically, you only need to disable these if the port's MTU is bigger than
>the mbuf size. If the MTU is smaller, use normal mbufs.
>Would it make sense to have this chained-mbuf support per port rather than global?

That was something that I considered initially, but ultimately I came to the conclusion that this would add additional complexity and overhead to the fast path, which could adversely impact performance.
Is there a strong case for mixed-mode jumbo frames on disparate ports on the same bridge?
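
(For reference, the per-port variant suggested above would amount to a
hypothetical check along these lines, rather than keying off the global flag
alone; it is not part of this patch:)

    /* Disable the simple/vectorized Tx path only for ports whose MTU
     * actually requires chained mbufs. */
    if (dpdk_multi_segment_mbufs && dev->mtu > ETHER_MTU) {
        memset(&txconf, 0, sizeof(txconf));
        txconf.txq_flags &= ~ETH_TXQ_FLAGS_NOMULTSEGS;
    }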

>
>
>> +        }
>> +
>>          for (i = 0; i < n_txq; i++) {
>>              diag = rte_eth_tx_queue_setup(dev->port_id, i, dev->txq_size,
>> -                                          dev->socket_id, NULL);
>> +                                          dev->socket_id,
>> +                                          dpdk_multi_segment_mbufs ? &txconf
>> +                                                                   : NULL);
>>              if (diag) {
>>                  VLOG_INFO("Interface %s txq(%d) setup error: %s",
>>                            dev->up.name, i, rte_strerror(-diag));
>> @@ -3287,6 +3323,12 @@ unlock:
>>      return err;
>>  }
>>
>> +void
>> +netdev_dpdk_multi_segment_mbufs_enable(void)
>> +{
>> +    dpdk_multi_segment_mbufs = true;
>> +}
>> +
>>  #define NETDEV_DPDK_CLASS(NAME, INIT, CONSTRUCT, DESTRUCT,    \
>>                            SET_CONFIG, SET_TX_MULTIQ, SEND,    \
>>                            GET_CARRIER, GET_STATS,             \
>> diff --git a/lib/netdev-dpdk.h b/lib/netdev-dpdk.h
>> index b7d02a7..5b14c96 100644
>> --- a/lib/netdev-dpdk.h
>> +++ b/lib/netdev-dpdk.h
>> @@ -26,6 +26,7 @@ struct dp_packet;
>>  #ifdef DPDK_NETDEV
>>
>>  void netdev_dpdk_register(void);
>> +void netdev_dpdk_multi_segment_mbufs_enable(void);
>>  void free_dpdk_buf(struct dp_packet *);
>>
>>  #else
>> diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml
>> index bb66cb5..fd8551e 100644
>> --- a/vswitchd/vswitch.xml
>> +++ b/vswitchd/vswitch.xml
>> @@ -283,6 +283,25 @@
>>          </p>
>>        </column>
>>
>> +      <column name="other_config" key="dpdk-multi-seg-mbufs"
>> +              type='{"type": "boolean"}'>
>> +        <p>
>> +          Specifies if DPDK uses multi-segment mbufs for handling jumbo frames.
>> +        </p>
>> +        <p>
>> +            If true, DPDK allocates a single mempool for all ports, irrespective
>> +            of the ports' requested MTU sizes. The elements of this mempool are
>> +            'standard'-sized mbufs (typically ~2 KB), which may be chained
>> +            together to accommodate jumbo frames. In this approach, each mbuf
>> +            typically stores a fragment of the overall jumbo frame.
>> +        </p>
>> +        <p>
>> +            If not specified, defaults to <code>false</code>, in which case the size
>> +            of each mbuf within a DPDK port's mempool will be grown to accommodate
>> +            jumbo frames within a single mbuf.
>> +        </p>
>> +      </column>
>> +
>>        <column name="other_config" key="dpdk-extra"
>>                type='{"type": "string"}'>
>>          <p>
>> --
>> 1.9.3
>>
>> _______________________________________________
>> dev mailing list
>> dev@openvswitch.org
>> https://mail.openvswitch.org/mailman/listinfo/ovs-dev

Patch

diff --git a/lib/dpdk.c b/lib/dpdk.c
index 8da6c32..7d08e34 100644
--- a/lib/dpdk.c
+++ b/lib/dpdk.c
@@ -450,6 +450,12 @@  dpdk_init__(const struct smap *ovs_other_config)
 
     /* Finally, register the dpdk classes */
     netdev_dpdk_register();
+
+    bool multi_seg_mbufs_enable = smap_get_bool(ovs_other_config, "dpdk-multi-seg-mbufs", false);
+    if (multi_seg_mbufs_enable) {
+        VLOG_INFO("DPDK multi-segment mbufs enabled\n");
+        netdev_dpdk_multi_segment_mbufs_enable();
+    }
 }
 
 void
diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index 34fc54b..82bc0e2 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -58,6 +58,7 @@ 
 
 VLOG_DEFINE_THIS_MODULE(netdev_dpdk);
 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
+static bool dpdk_multi_segment_mbufs = false;
 
 #define DPDK_PORT_WATCHDOG_INTERVAL 5
 
@@ -480,7 +481,7 @@  dpdk_mp_create(int socket_id, int mtu)
      * when the number of ports and rxqs that utilize a particular mempool can
     * change dynamically at runtime. For now, use this rough heuristic.
      */
-    if (mtu >= ETHER_MTU) {
+    if (mtu >= ETHER_MTU || dpdk_multi_segment_mbufs) {
         mp_size = MAX_NB_MBUF;
     } else {
         mp_size = MIN_NB_MBUF;
@@ -558,17 +559,33 @@  dpdk_mp_put(struct dpdk_mp *dmp)
     ovs_mutex_unlock(&dpdk_mp_mutex);
 }
 
-/* Tries to allocate new mempool on requested_socket_id with
- * mbuf size corresponding to requested_mtu.
+/* Tries to configure a mempool for 'dev' on requested socket_id to accommodate
+ * packets of size 'requested_mtu'. The properties of the mempool's elements
+ * are dependent on the value of 'dpdk_multi_segment_mbufs':
+ * - if 'true', then the mempool contains standard-sized mbufs that are chained
+ *   together to accommodate packets of size 'requested_mtu'. All ports on the
+ *   same socket will share this mempool, irrespective of their MTU.
+ * - if 'false', then a mempool is allocated, the members of which are non-
+ *   standard-sized mbufs. Each mbuf in the mempool is large enough to fully
+ *   accommodate packets of size 'requested_mtu'.
+ *
  * On success new configuration will be applied.
  * On error, device will be left unchanged. */
 static int
 netdev_dpdk_mempool_configure(struct netdev_dpdk *dev)
     OVS_REQUIRES(dev->mutex)
 {
-    uint32_t buf_size = dpdk_buf_size(dev->requested_mtu);
+    uint32_t buf_size = 0;
     struct dpdk_mp *mp;
 
+    /* Contiguous mbufs in use - permit oversized mbufs */
+    if (!dpdk_multi_segment_mbufs) {
+        buf_size = dpdk_buf_size(dev->requested_mtu);
+    } else {
+        /* multi-segment mbufs - use standard mbuf size */
+        buf_size = dpdk_buf_size(ETHER_MTU);
+    }
+
     mp = dpdk_mp_get(dev->requested_socket_id, FRAME_LEN_TO_MTU(buf_size));
     if (!mp) {
         VLOG_ERR("Failed to create memory pool for netdev "
@@ -577,7 +594,13 @@  netdev_dpdk_mempool_configure(struct netdev_dpdk *dev)
                  rte_strerror(rte_errno));
         return rte_errno;
     } else {
-        dpdk_mp_put(dev->dpdk_mp);
+        /* When single-segment mbufs are in use, a new mempool is allocated,
+         * so release the old one. In the case of multi-segment mbufs, the
+         * same mempool is used for all MTUs.
+         */
+        if (!dpdk_multi_segment_mbufs) {
+            dpdk_mp_put(dev->dpdk_mp);
+        }
         dev->dpdk_mp = mp;
         dev->mtu = dev->requested_mtu;
         dev->socket_id = dev->requested_socket_id;
@@ -639,6 +662,7 @@  dpdk_eth_dev_queue_setup(struct netdev_dpdk *dev, int n_rxq, int n_txq)
     int diag = 0;
     int i;
     struct rte_eth_conf conf = port_conf;
+    struct rte_eth_txconf txconf;
 
     if (dev->mtu > ETHER_MTU) {
         conf.rxmode.jumbo_frame = 1;
@@ -666,9 +690,21 @@  dpdk_eth_dev_queue_setup(struct netdev_dpdk *dev, int n_rxq, int n_txq)
             break;
         }
 
+        /* DPDK PMDs typically attempt to use simple or vectorized
+         * transmit functions, neither of which are compatible with
+         * multi-segment mbufs. Ensure that these are disabled in the
+         * multi-segment mbuf case.
+         */
+        if (dpdk_multi_segment_mbufs) {
+            memset(&txconf, 0, sizeof(txconf));
+            txconf.txq_flags &= ~ETH_TXQ_FLAGS_NOMULTSEGS;
+        }
+
         for (i = 0; i < n_txq; i++) {
             diag = rte_eth_tx_queue_setup(dev->port_id, i, dev->txq_size,
-                                          dev->socket_id, NULL);
+                                          dev->socket_id,
+                                          dpdk_multi_segment_mbufs ? &txconf
+                                                                   : NULL);
             if (diag) {
                 VLOG_INFO("Interface %s txq(%d) setup error: %s",
                           dev->up.name, i, rte_strerror(-diag));
@@ -3287,6 +3323,12 @@  unlock:
     return err;
 }
 
+void
+netdev_dpdk_multi_segment_mbufs_enable(void)
+{
+    dpdk_multi_segment_mbufs = true;
+}
+
 #define NETDEV_DPDK_CLASS(NAME, INIT, CONSTRUCT, DESTRUCT,    \
                           SET_CONFIG, SET_TX_MULTIQ, SEND,    \
                           GET_CARRIER, GET_STATS,             \
diff --git a/lib/netdev-dpdk.h b/lib/netdev-dpdk.h
index b7d02a7..5b14c96 100644
--- a/lib/netdev-dpdk.h
+++ b/lib/netdev-dpdk.h
@@ -26,6 +26,7 @@  struct dp_packet;
 #ifdef DPDK_NETDEV
 
 void netdev_dpdk_register(void);
+void netdev_dpdk_multi_segment_mbufs_enable(void);
 void free_dpdk_buf(struct dp_packet *);
 
 #else
diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml
index bb66cb5..fd8551e 100644
--- a/vswitchd/vswitch.xml
+++ b/vswitchd/vswitch.xml
@@ -283,6 +283,25 @@ 
         </p>
       </column>
 
+      <column name="other_config" key="dpdk-multi-seg-mbufs"
+              type='{"type": "boolean"}'>
+        <p>
+          Specifies if DPDK uses multi-segment mbufs for handling jumbo frames.
+        </p>
+        <p>
+            If true, DPDK allocates a single mempool for all ports, irrespective
+            of the ports' requested MTU sizes. The elements of this mempool are
+            'standard'-sized mbufs (typically ~2 KB), which may be chained
+            together to accommodate jumbo frames. In this approach, each mbuf
+            typically stores a fragment of the overall jumbo frame.
+        </p>
+        <p>
+            If not specified, defaults to <code>false</code>, in which case the size
+            of each mbuf within a DPDK port's mempool will be grown to accommodate
+            jumbo frames within a single mbuf.
+        </p>
+      </column>
+
       <column name="other_config" key="dpdk-extra"
               type='{"type": "string"}'>
         <p>