Message ID | 1527094048-105084-2-git-send-email-tiago.lam@intel.com |
---|---|
State | Superseded |
Headers | show |
Series | Support multi-segment mbufs | expand |
> > From: Mark Kavanagh <mark.b.kavanagh@intel.com> > > There are numerous factors that must be considered when calculating > the size of an mbuf: > - the data portion of the mbuf must be sized in accordance With Rx > buffer alignment (typically 1024B). So, for example, in order to > successfully receive and capture a 1500B packet, mbufs with a > data portion of size 2048B must be used. > - in OvS, the elements that comprise an mbuf are: > * the dp packet, which includes a struct rte mbuf (704B) > * RTE_PKTMBUF_HEADROOM (128B) > * packet data (aligned to 1k, as previously described) > * RTE_PKTMBUF_TAILROOM (typically 0) > > Some PMDs require that the total mbuf size (i.e. the total sum of all > of the above-listed components' lengths) is cache-aligned. To satisfy > this requirement, it may be necessary to round up the total mbuf size > with respect to cacheline size. In doing so, it's possible that the > dp_packet's data portion is inadvertently increased in size, such that > it no longer adheres to Rx buffer alignment. Consequently, the > following property of the mbuf no longer holds true: > > mbuf.data_len == mbuf.buf_len - mbuf.data_off > > This creates a problem in the case of multi-segment mbufs, where that > assumption is assumed to be true for all but the final segment in an > mbuf chain. Resolve this issue by adjusting the size of the mbuf's > private data portion, as opposed to the packet data portion when > aligning mbuf size to cachelines. > > Fixes: 4be4d22 ("netdev-dpdk: clean up mbuf initialization") > Fixes: 31b88c9 ("netdev-dpdk: round up mbuf_size to cache_line_size") > CC: Santosh Shukla <santosh.shukla@caviumnetworks.com> > Signed-off-by: Mark Kavanagh <mark.b.kavanagh@intel.com> > --- > lib/netdev-dpdk.c | 46 ++++++++++++++++++++++++++++++---------------- > 1 file changed, 30 insertions(+), 16 deletions(-) > > diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c > index 87152a7..356a967 100644 > --- a/lib/netdev-dpdk.c > +++ b/lib/netdev-dpdk.c > @@ -82,12 +82,6 @@ static struct vlog_rate_limit rl = > VLOG_RATE_LIMIT_INIT(5, 20); > + (2 * VLAN_HEADER_LEN)) > #define MTU_TO_FRAME_LEN(mtu) ((mtu) + ETHER_HDR_LEN + > ETHER_CRC_LEN) > #define MTU_TO_MAX_FRAME_LEN(mtu) ((mtu) + > ETHER_HDR_MAX_LEN) > -#define FRAME_LEN_TO_MTU(frame_len) ((frame_len) \ > - - ETHER_HDR_LEN - ETHER_CRC_LEN) > -#define MBUF_SIZE(mtu) > ROUND_UP((MTU_TO_MAX_FRAME_LEN(mtu) \ > - + sizeof(struct dp_packet) \ > - + RTE_PKTMBUF_HEADROOM), \ > - RTE_CACHE_LINE_SIZE) > #define NETDEV_DPDK_MBUF_ALIGN 1024 > #define NETDEV_DPDK_MAX_PKT_LEN 9728 > > @@ -491,7 +485,7 @@ is_dpdk_class(const struct netdev_class *class) > * behaviour, which reduces performance. To prevent this, use a buffer size > * that is closest to 'mtu', but which satisfies the aforementioned criteria. > */ > -static uint32_t > +static uint16_t > dpdk_buf_size(int mtu) > { > return ROUND_UP((MTU_TO_MAX_FRAME_LEN(mtu) + > RTE_PKTMBUF_HEADROOM), > @@ -582,7 +576,7 @@ dpdk_mp_do_not_free(struct rte_mempool *mp) > OVS_REQUIRES(dpdk_mp_mutex) > * - a new mempool was just created; > * - a matching mempool already exists. */ > static struct rte_mempool * > -dpdk_mp_create(struct netdev_dpdk *dev, int mtu) > +dpdk_mp_create(struct netdev_dpdk *dev, uint16_t mbuf_pkt_data_len) > { > char mp_name[RTE_MEMPOOL_NAMESIZE]; > const char *netdev_name = netdev_get_name(&dev->up); > @@ -590,6 +584,7 @@ dpdk_mp_create(struct netdev_dpdk *dev, int mtu) > uint32_t n_mbufs; > uint32_t hash = hash_string(netdev_name, 0); > struct rte_mempool *mp = NULL; > + uint16_t mbuf_size, aligned_mbuf_size, mbuf_priv_data_len; > > /* > * XXX: rough estimation of number of mbufs required for this port: > @@ -609,12 +604,13 @@ dpdk_mp_create(struct netdev_dpdk *dev, int > mtu) > * longer than RTE_MEMPOOL_NAMESIZE. */ > int ret = snprintf(mp_name, RTE_MEMPOOL_NAMESIZE, > "ovs%08x%02d%05d%07u", > - hash, socket_id, mtu, n_mbufs); > + hash, socket_id, mbuf_pkt_data_len, n_mbufs); %u for printing the unsigned mbuf_pkt_data_len > if (ret < 0 || ret >= RTE_MEMPOOL_NAMESIZE) { > VLOG_DBG("snprintf returned %d. " > "Failed to generate a mempool name for \"%s\". " > - "Hash:0x%x, socket_id: %d, mtu:%d, mbufs:%u.", > - ret, netdev_name, hash, socket_id, mtu, n_mbufs); > + "Hash:0x%x, socket_id: %d, pkt data room:%d, mbufs:%u.", > + ret, netdev_name, hash, socket_id, mbuf_pkt_data_len, > + n_mbufs); Same here > break; > } > > @@ -623,13 +619,31 @@ dpdk_mp_create(struct netdev_dpdk *dev, int > mtu) > netdev_name, n_mbufs, socket_id, > dev->requested_n_rxq, dev->requested_n_txq); > > + mbuf_priv_data_len = sizeof(struct dp_packet) - > + sizeof(struct rte_mbuf); > + /* The size of the entire mbuf. */ > + mbuf_size = sizeof (struct dp_packet) + > + mbuf_pkt_data_len + RTE_PKTMBUF_HEADROOM; > + /* mbuf size, rounded up to cacheline size. */ > + aligned_mbuf_size = ROUND_UP(mbuf_size, RTE_CACHE_LINE_SIZE); > + /* If there is a size discrepancy, add padding to mbuf_priv_data_len. > + * This maintains mbuf size cache alignment, while also honoring RX > + * buffer alignment in the data portion of the mbuf. If this adjustment > + * is not made, there is a possiblity later on that for an element of > + * the mempool, buf, buf->data_len < (buf->buf_len - buf->data_off). > + * This is problematic in the case of multi-segment mbufs, particularly > + * when an mbuf segment needs to be resized (when [push|popp]ing a > VLAN > + * header, for example. > + */ > + mbuf_priv_data_len += (aligned_mbuf_size - mbuf_size); > + > mp = rte_pktmbuf_pool_create(mp_name, n_mbufs, MP_CACHE_SZ, > - sizeof (struct dp_packet) - sizeof (struct rte_mbuf), > - MBUF_SIZE(mtu) - sizeof(struct dp_packet), socket_id); > + mbuf_priv_data_len, > + mbuf_pkt_data_len + RTE_PKTMBUF_HEADROOM, socket_id); > > if (mp) { > VLOG_DBG("Allocated \"%s\" mempool with %u mbufs", > - mp_name, n_mbufs); > + mp_name, n_mbufs); > /* rte_pktmbuf_pool_create has done some initialization of the > * rte_mbuf part of each dp_packet. Some OvS specific fields > * of the packet still need to be initialized by > @@ -690,13 +704,13 @@ static int > netdev_dpdk_mempool_configure(struct netdev_dpdk *dev) > OVS_REQUIRES(dev->mutex) > { > - uint32_t buf_size = dpdk_buf_size(dev->requested_mtu); > + uint16_t buf_size = dpdk_buf_size(dev->requested_mtu); > struct rte_mempool *mp; > int ret = 0; > > dpdk_mp_sweep(); > > - mp = dpdk_mp_create(dev, FRAME_LEN_TO_MTU(buf_size)); > + mp = dpdk_mp_create(dev, buf_size); > if (!mp) { > VLOG_ERR("Failed to create memory pool for netdev " > "%s, with MTU %d on socket %d: %s\n", > -- > 2.7.4 > > _______________________________________________ > dev mailing list > dev@openvswitch.org > https://mail.openvswitch.org/mailman/listinfo/ovs-dev
diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index 87152a7..356a967 100644 --- a/lib/netdev-dpdk.c +++ b/lib/netdev-dpdk.c @@ -82,12 +82,6 @@ static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20); + (2 * VLAN_HEADER_LEN)) #define MTU_TO_FRAME_LEN(mtu) ((mtu) + ETHER_HDR_LEN + ETHER_CRC_LEN) #define MTU_TO_MAX_FRAME_LEN(mtu) ((mtu) + ETHER_HDR_MAX_LEN) -#define FRAME_LEN_TO_MTU(frame_len) ((frame_len) \ - - ETHER_HDR_LEN - ETHER_CRC_LEN) -#define MBUF_SIZE(mtu) ROUND_UP((MTU_TO_MAX_FRAME_LEN(mtu) \ - + sizeof(struct dp_packet) \ - + RTE_PKTMBUF_HEADROOM), \ - RTE_CACHE_LINE_SIZE) #define NETDEV_DPDK_MBUF_ALIGN 1024 #define NETDEV_DPDK_MAX_PKT_LEN 9728 @@ -491,7 +485,7 @@ is_dpdk_class(const struct netdev_class *class) * behaviour, which reduces performance. To prevent this, use a buffer size * that is closest to 'mtu', but which satisfies the aforementioned criteria. */ -static uint32_t +static uint16_t dpdk_buf_size(int mtu) { return ROUND_UP((MTU_TO_MAX_FRAME_LEN(mtu) + RTE_PKTMBUF_HEADROOM), @@ -582,7 +576,7 @@ dpdk_mp_do_not_free(struct rte_mempool *mp) OVS_REQUIRES(dpdk_mp_mutex) * - a new mempool was just created; * - a matching mempool already exists. */ static struct rte_mempool * -dpdk_mp_create(struct netdev_dpdk *dev, int mtu) +dpdk_mp_create(struct netdev_dpdk *dev, uint16_t mbuf_pkt_data_len) { char mp_name[RTE_MEMPOOL_NAMESIZE]; const char *netdev_name = netdev_get_name(&dev->up); @@ -590,6 +584,7 @@ dpdk_mp_create(struct netdev_dpdk *dev, int mtu) uint32_t n_mbufs; uint32_t hash = hash_string(netdev_name, 0); struct rte_mempool *mp = NULL; + uint16_t mbuf_size, aligned_mbuf_size, mbuf_priv_data_len; /* * XXX: rough estimation of number of mbufs required for this port: @@ -609,12 +604,13 @@ dpdk_mp_create(struct netdev_dpdk *dev, int mtu) * longer than RTE_MEMPOOL_NAMESIZE. */ int ret = snprintf(mp_name, RTE_MEMPOOL_NAMESIZE, "ovs%08x%02d%05d%07u", - hash, socket_id, mtu, n_mbufs); + hash, socket_id, mbuf_pkt_data_len, n_mbufs); if (ret < 0 || ret >= RTE_MEMPOOL_NAMESIZE) { VLOG_DBG("snprintf returned %d. " "Failed to generate a mempool name for \"%s\". " - "Hash:0x%x, socket_id: %d, mtu:%d, mbufs:%u.", - ret, netdev_name, hash, socket_id, mtu, n_mbufs); + "Hash:0x%x, socket_id: %d, pkt data room:%d, mbufs:%u.", + ret, netdev_name, hash, socket_id, mbuf_pkt_data_len, + n_mbufs); break; } @@ -623,13 +619,31 @@ dpdk_mp_create(struct netdev_dpdk *dev, int mtu) netdev_name, n_mbufs, socket_id, dev->requested_n_rxq, dev->requested_n_txq); + mbuf_priv_data_len = sizeof(struct dp_packet) - + sizeof(struct rte_mbuf); + /* The size of the entire mbuf. */ + mbuf_size = sizeof (struct dp_packet) + + mbuf_pkt_data_len + RTE_PKTMBUF_HEADROOM; + /* mbuf size, rounded up to cacheline size. */ + aligned_mbuf_size = ROUND_UP(mbuf_size, RTE_CACHE_LINE_SIZE); + /* If there is a size discrepancy, add padding to mbuf_priv_data_len. + * This maintains mbuf size cache alignment, while also honoring RX + * buffer alignment in the data portion of the mbuf. If this adjustment + * is not made, there is a possiblity later on that for an element of + * the mempool, buf, buf->data_len < (buf->buf_len - buf->data_off). + * This is problematic in the case of multi-segment mbufs, particularly + * when an mbuf segment needs to be resized (when [push|popp]ing a VLAN + * header, for example. + */ + mbuf_priv_data_len += (aligned_mbuf_size - mbuf_size); + mp = rte_pktmbuf_pool_create(mp_name, n_mbufs, MP_CACHE_SZ, - sizeof (struct dp_packet) - sizeof (struct rte_mbuf), - MBUF_SIZE(mtu) - sizeof(struct dp_packet), socket_id); + mbuf_priv_data_len, + mbuf_pkt_data_len + RTE_PKTMBUF_HEADROOM, socket_id); if (mp) { VLOG_DBG("Allocated \"%s\" mempool with %u mbufs", - mp_name, n_mbufs); + mp_name, n_mbufs); /* rte_pktmbuf_pool_create has done some initialization of the * rte_mbuf part of each dp_packet. Some OvS specific fields * of the packet still need to be initialized by @@ -690,13 +704,13 @@ static int netdev_dpdk_mempool_configure(struct netdev_dpdk *dev) OVS_REQUIRES(dev->mutex) { - uint32_t buf_size = dpdk_buf_size(dev->requested_mtu); + uint16_t buf_size = dpdk_buf_size(dev->requested_mtu); struct rte_mempool *mp; int ret = 0; dpdk_mp_sweep(); - mp = dpdk_mp_create(dev, FRAME_LEN_TO_MTU(buf_size)); + mp = dpdk_mp_create(dev, buf_size); if (!mp) { VLOG_ERR("Failed to create memory pool for netdev " "%s, with MTU %d on socket %d: %s\n",