@@ -71,3 +71,54 @@ Jumbo frame support has been validated against 9728B frames, which is the
largest frame size supported by Fortville NIC using the DPDK i40e driver, but
larger frames and other DPDK NIC drivers may be supported. These cases are
common for use cases involving East-West traffic only.
+
+-------------------
+Multi-segment mbufs
+-------------------
+
+Instead of increasing the size of mbufs within a mempool, such that each mbuf
+within the pool is large enough to contain an entire jumbo frame of a
+user-defined size, mbufs can be chained together instead. In this approach each
+mbuf in the chain stores a portion of the jumbo frame, by default ~2K bytes,
+irrespective of the user-requested MTU value. Since each mbuf in the chain is
+termed a segment, this approach is named "multi-segment mbufs".
+
+This approach may bring more flexibility in use cases where the maximum packet
+length may be hard to guess. For example, in cases where packets originate from
+sources marked for offload (such as TSO), each packet may be larger than the
+MTU, and as such, when forwarding it to a DPDK port a single mbuf may not be
+enough to hold all of the packet's data.
+
+Multi-segment and single-segment mbufs are mutually exclusive, and the user
+must decide on which approach to adopt on initialisation. If multi-segment
+mbufs are to be enabled, this can be done with the following command::
+
+ $ ovs-vsctl set Open_vSwitch . other_config:dpdk-multi-seg-mbufs=true
+
+Single-segment mbufs still remain the default when using OvS-DPDK, and the
+above option `dpdk-multi-seg-mbufs` must be explicitly set to `true` if
+multi-segment mbufs are to be used.
+
+~~~~~~~~~~~~~~~~~
+Performance notes
+~~~~~~~~~~~~~~~~~
+
+When using multi-segment mbufs some PMDs may not support vectorized Tx
+functions, due to the mbufs' non-contiguous nature. As a result this can hit
+performance for smaller packet sizes. For example, on a setup sending 64B
+packets at line rate, a decrease of ~20% has been observed. The performance
+impact stops being noticeable for larger packet sizes, although the exact size
+will vary between PMDs, and depends on the architecture one is using.
+
+Tests performed with the i40e PMD driver only showed this limitation for 64B
+packets, and the same rate was observed when comparing multi-segment mbufs and
+single-segment mbufs for 128B packets. In other words, the 20% drop in
+performance was not observed for packets >= 128B during this test case.
+
+Because of this, use of multi-segment mbufs is not advised with smaller
+packet sizes, such as 64B.
+
+Also, note that using multi-segment mbufs won't improve memory usage. For a
+packet of 9000B, for example, which would be stored on a single mbuf when using
+the single-segment approach, 5 mbufs (9000/2048) of 2048B would be needed to
+store the same data using the multi-segment mbufs approach.
@@ -111,6 +111,7 @@ v2.9.0 - 19 Feb 2018
pmd assignments.
* Add rxq utilization of pmd to appctl 'dpif-netdev/pmd-rxq-show'.
* Add support for vHost dequeue zero copy (experimental)
+ * Add support for multi-segment mbufs
- Userspace datapath:
* Output packet batching support.
- vswitchd:
@@ -491,6 +491,14 @@ dpdk_init__(const struct smap *ovs_other_config)
/* Finally, register the dpdk classes */
netdev_dpdk_register();
+
+ bool multi_seg_mbufs_enable = smap_get_bool(ovs_other_config,
+ "dpdk-multi-seg-mbufs", false);
+ if (multi_seg_mbufs_enable) {
+ VLOG_INFO("DPDK multi-segment mbufs enabled\n");
+ netdev_dpdk_multi_segment_mbufs_enable();
+ }
+
return true;
}
@@ -66,6 +66,7 @@ enum {VIRTIO_RXQ, VIRTIO_TXQ, VIRTIO_QNUM};
VLOG_DEFINE_THIS_MODULE(netdev_dpdk);
static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
+static bool dpdk_multi_segment_mbufs = false;
#define DPDK_PORT_WATCHDOG_INTERVAL 5
@@ -628,7 +629,7 @@ dpdk_mp_create(struct netdev_dpdk *dev, uint16_t mbuf_pkt_data_len)
uint32_t n_mbufs;
uint32_t hash = hash_string(netdev_name, 0);
struct rte_mempool *mp = NULL;
- uint16_t mbuf_size, aligned_mbuf_size, mbuf_priv_data_len;
+ uint16_t mbuf_size, aligned_mbuf_size, mbuf_priv_data_len, max_frame_len;
/*
* XXX: rough estimation of number of mbufs required for this port:
@@ -642,6 +643,21 @@ dpdk_mp_create(struct netdev_dpdk *dev, uint16_t mbuf_pkt_data_len)
+ MIN(RTE_MAX_LCORE, dev->requested_n_rxq) * NETDEV_MAX_BURST
+ MIN_NB_MBUF;
+ /* If multi-segment mbufs are used, we also increase the number of mbufs
+ * used. This is done by calculating how many mbufs are needed to hold the
+ * data on a single packet of MTU size. For example, for a received packet
+ * of 9000B, 5 mbufs (9000 / 2048) are needed to hold the data - 4 more
+ * than with single-mbufs (as mbufs' size is extended to hold all data) */
+ max_frame_len = MTU_TO_MAX_FRAME_LEN(dev->requested_mtu);
+ if (dpdk_multi_segment_mbufs && mbuf_pkt_data_len < max_frame_len) {
+ uint16_t nb_segs = max_frame_len / mbuf_pkt_data_len;
+ if (max_frame_len % mbuf_pkt_data_len) {
+ nb_segs += 1;
+ }
+
+ n_mbufs *= nb_segs;
+ }
+
ovs_mutex_lock(&dpdk_mp_mutex);
do {
/* Full DPDK memory pool name must be unique and cannot be
@@ -743,7 +759,13 @@ dpdk_mp_release(struct rte_mempool *mp)
/* Tries to allocate a new mempool - or re-use an existing one where
* appropriate - on requested_socket_id with a size determined by
- * requested_mtu and requested Rx/Tx queues.
+ * requested_mtu and requested Rx/Tx queues. Some properties of the mempool's
+ * elements are dependent on the value of 'dpdk_multi_segment_mbufs':
+ * - if 'true', then the mempool contains standard-sized mbufs that are chained
+ * together to accommodate packets of size 'requested_mtu'.
+ * - if 'false', then the members of the allocated mempool are
+ * non-standard-sized mbufs. Each mbuf in the mempool is large enough to
+ *   fully accommodate packets of size 'requested_mtu'.
* On success - or when re-using an existing mempool - the new configuration
* will be applied.
* On error, device will be left unchanged. */
@@ -751,10 +773,18 @@ static int
netdev_dpdk_mempool_configure(struct netdev_dpdk *dev)
OVS_REQUIRES(dev->mutex)
{
- uint16_t buf_size = dpdk_buf_size(dev->requested_mtu);
+ uint16_t buf_size = 0;
struct rte_mempool *mp;
int ret = 0;
+ /* Contiguous mbufs in use - permit oversized mbufs */
+ if (!dpdk_multi_segment_mbufs) {
+ buf_size = dpdk_buf_size(dev->requested_mtu);
+ } else {
+ /* multi-segment mbufs - use standard mbuf size */
+ buf_size = dpdk_buf_size(ETHER_MTU);
+ }
+
dpdk_mp_sweep();
mp = dpdk_mp_create(dev, buf_size);
@@ -836,6 +866,7 @@ dpdk_eth_dev_port_config(struct netdev_dpdk *dev, int n_rxq, int n_txq)
int diag = 0;
int i;
struct rte_eth_conf conf = port_conf;
+ struct rte_eth_txconf txconf;
struct rte_eth_dev_info info;
uint16_t conf_mtu;
@@ -852,6 +883,18 @@ dpdk_eth_dev_port_config(struct netdev_dpdk *dev, int n_rxq, int n_txq)
}
}
+ /* Multi-segment-mbuf-specific setup. */
+ if (dpdk_multi_segment_mbufs) {
+ /* DPDK PMDs typically attempt to use simple or vectorized
+ * transmit functions, neither of which are compatible with
+ * multi-segment mbufs. Ensure that these are disabled when
+ * multi-segment mbufs are enabled.
+ */
+ rte_eth_dev_info_get(dev->port_id, &info);
+ txconf = info.default_txconf;
+ txconf.txq_flags &= ~ETH_TXQ_FLAGS_NOMULTSEGS;
+ }
+
conf.intr_conf.lsc = dev->lsc_interrupt_mode;
conf.rxmode.hw_ip_checksum = (dev->hw_ol_features &
NETDEV_RX_CHECKSUM_OFFLOAD) != 0;
@@ -896,7 +939,9 @@ dpdk_eth_dev_port_config(struct netdev_dpdk *dev, int n_rxq, int n_txq)
for (i = 0; i < n_txq; i++) {
diag = rte_eth_tx_queue_setup(dev->port_id, i, dev->txq_size,
- dev->socket_id, NULL);
+ dev->socket_id,
+ dpdk_multi_segment_mbufs ? &txconf
+ : NULL);
if (diag) {
VLOG_INFO("Interface %s unable to setup txq(%d): %s",
dev->up.name, i, rte_strerror(-diag));
@@ -3985,6 +4030,18 @@ unlock:
return err;
}
+/* Returns true if multi-segment mbuf support has been enabled via the
+ * 'dpdk-multi-seg-mbufs' other_config option, false otherwise. */
+bool
+netdev_dpdk_is_multi_segment_mbufs_enabled(void)
+{
+    return dpdk_multi_segment_mbufs;
+}
+
+/* Enables multi-segment mbuf support; called once during DPDK init. */
+void
+netdev_dpdk_multi_segment_mbufs_enable(void)
+{
+
#define NETDEV_DPDK_CLASS(NAME, INIT, CONSTRUCT, DESTRUCT, \
SET_CONFIG, SET_TX_MULTIQ, SEND, \
GET_CARRIER, GET_STATS, \
@@ -25,6 +25,8 @@ struct dp_packet;
#ifdef DPDK_NETDEV
+bool netdev_dpdk_is_multi_segment_mbufs_enabled(void);
+void netdev_dpdk_multi_segment_mbufs_enable(void);
void netdev_dpdk_register(void);
void free_dpdk_buf(struct dp_packet *);
@@ -332,6 +332,29 @@
</p>
</column>
+ <column name="other_config" key="dpdk-multi-seg-mbufs"
+ type='{"type": "boolean"}'>
+ <p>
+ Specifies if DPDK uses multi-segment mbufs for handling jumbo frames.
+ </p>
+ <p>
+ If true, DPDK allocates a single mempool per port, irrespective of
+ the ports' requested MTU sizes. The elements of this mempool are
+        'standard'-sized mbufs (typically 2KB), which may be chained
+ together to accommodate jumbo frames. In this approach, each mbuf
+ typically stores a fragment of the overall jumbo frame.
+ </p>
+ <p>
+ If not specified, defaults to <code>false</code>, in which case, the
+ size of each mbuf within a DPDK port's mempool will be grown to
+ accommodate jumbo frames within a single mbuf.
+ </p>
+ <p>
+ Changing this value requires restarting the daemon.
+ </p>
+ </column>
+
+
<column name="other_config" key="vhost-sock-dir"
type='{"type": "string"}'>
<p>