Message ID | 1528734090-220990-13-git-send-email-tiago.lam@intel.com |
---|---|
State | Superseded |
Headers | show |
Series | Support multi-segment mbufs | expand |
On 11 Jun 2018, at 18:21, Tiago Lam wrote: > From: Mark Kavanagh <mark.b.kavanagh@intel.com> > > Currently, packets are only copied to a single segment in the function > dpdk_do_tx_copy(). This could be an issue in the case of jumbo frames, > particularly when multi-segment mbufs are involved. > > This patch calculates the number of segments needed by a packet and > copies the data to each segment. > > A new function, dpdk_buf_alloc(), has also been introduced as a > wrapper > around the nonpmd_mp_mutex to serialise allocations from a non-pmd > context. > > Co-authored-by: Michael Qiu <qiudayu@chinac.com> > Co-authored-by: Tiago Lam <tiago.lam@intel.com> > > Signed-off-by: Mark Kavanagh <mark.b.kavanagh@intel.com> > Signed-off-by: Michael Qiu <qiudayu@chinac.com> > Signed-off-by: Tiago Lam <tiago.lam@intel.com> > --- > lib/netdev-dpdk.c | 94 > +++++++++++++++++++++++++++++++++++++++++++++++++------ > 1 file changed, 84 insertions(+), 10 deletions(-) > > diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c > index 9b1fb9a..0079e28 100644 > --- a/lib/netdev-dpdk.c > +++ b/lib/netdev-dpdk.c > @@ -515,6 +515,22 @@ dpdk_rte_mzalloc(size_t sz) > return rte_zmalloc(OVS_VPORT_DPDK, sz, OVS_CACHE_LINE_SIZE); > } > > +static struct rte_mbuf * > +dpdk_buf_alloc(struct rte_mempool *mp) > +{ > + if (!dpdk_thread_is_pmd()) { > + ovs_mutex_lock(&nonpmd_mp_mutex); Can you explain why the lock is needed here for non-PMD threads? > + } > + > + struct rte_mbuf *mbuf = rte_pktmbuf_alloc(mp); > + > + if (!dpdk_thread_is_pmd()) { > + ovs_mutex_unlock(&nonpmd_mp_mutex); > + } > + > + return mbuf; > +} > + > void > free_dpdk_buf(struct dp_packet *p) > { > @@ -2167,6 +2183,71 @@ out: > } > } > > +static int A bool (true for success) might be more useful here. > +dpdk_prep_tx_buf(struct dp_packet *packet, struct rte_mbuf **head, > + struct rte_mempool *mp) Maybe the function name should better reflect what it's doing, e.g. dpdk_clone_dp_packet_to_mbuf(). 
> +{ > + struct rte_mbuf *temp; > + uint32_t size = dp_packet_size(packet); > + uint16_t max_data_len, data_len; > + uint32_t nb_segs = 0; > + int i; > + > + temp = *head = dpdk_buf_alloc(mp); > + if (OVS_UNLIKELY(!temp)) { > + return 1; > + } > + > + /* All new allocated mbuf's max data len is the same */ > + max_data_len = temp->buf_len - temp->data_off; > + > + /* Calculate # of output mbufs. */ > + nb_segs = size / max_data_len; > + if (size % max_data_len) { > + nb_segs = nb_segs + 1; > + } > + > + /* Allocate additional mbufs when multiple output mbufs required. > */ > + for (i = 1; i < nb_segs; i++) { > + temp->next = dpdk_buf_alloc(mp); > + if (!temp->next) { > + free_dpdk_buf((struct dp_packet *) *head); > + *head = NULL; > + break; > + } > + temp = temp->next; > + } > + /* We have to do a copy for now */ > + rte_pktmbuf_pkt_len(*head) = size; > + temp = *head; > + > + data_len = size < max_data_len ? size: max_data_len; Can we use max_data_len to copy? It's only valid if rte_pktmbuf_mtod() returns the first byte, but after alloc it's rte_pktmbuf_reset_headroom(). So we will overwrite invalid memory. > + if (packet->source == DPBUF_DPDK) { > + *head = &(packet->mbuf); > + while (temp && head && size > 0) { > + rte_memcpy(rte_pktmbuf_mtod(temp, void *), > + dp_packet_data((struct dp_packet *)head), > data_len); Here you assume source and destination mbuf sizes are the same... Also use container_of for "(struct dp_packet *)head)" > + rte_pktmbuf_data_len(temp) = data_len; > + *head = (*head)->next; > + size = size - data_len; > + data_len = size < max_data_len ? size: max_data_len; > + temp = temp->next; > + } > + } else { Why not use dp_packet_mbuf_write() here? 
> + int offset = 0; > + while (temp && size > 0) { > + memcpy(rte_pktmbuf_mtod(temp, void *), > + dp_packet_at(packet, offset, data_len), > data_len); > + rte_pktmbuf_data_len(temp) = data_len; > + temp = temp->next; > + size = size - data_len; > + offset += data_len; > + data_len = size < max_data_len ? size: max_data_len; > + } > + } > + return 0; > +} > + > /* Tx function. Transmit packets indefinitely */ > static void > dpdk_do_tx_copy(struct netdev *netdev, int qid, struct > dp_packet_batch *batch) > @@ -2183,6 +2264,7 @@ dpdk_do_tx_copy(struct netdev *netdev, int qid, > struct dp_packet_batch *batch) > struct rte_mbuf *pkts[PKT_ARRAY_SIZE]; > uint32_t cnt = batch_cnt; > uint32_t dropped = 0; > + uint32_t i; > > if (dev->type != DPDK_DEV_VHOST) { > /* Check if QoS has been configured for this netdev. */ > @@ -2193,27 +2275,19 @@ dpdk_do_tx_copy(struct netdev *netdev, int > qid, struct dp_packet_batch *batch) > > uint32_t txcnt = 0; > > - for (uint32_t i = 0; i < cnt; i++) { > + for (i = 0; i < cnt; i++) { > struct dp_packet *packet = batch->packets[i]; > uint32_t size = dp_packet_size(packet); > - > if (OVS_UNLIKELY(size > dev->max_packet_len)) { > VLOG_WARN_RL(&rl, "Too big size %u max_packet_len %d", > size, dev->max_packet_len); > - > dropped++; > continue; > } > - > - pkts[txcnt] = rte_pktmbuf_alloc(dev->mp); > - if (OVS_UNLIKELY(!pkts[txcnt])) { > + if (dpdk_prep_tx_buf(packet, &pkts[txcnt], dev->mp)) { > dropped += cnt - i; > break; > } > - > - /* We have to do a copy for now */ > - memcpy(rte_pktmbuf_mtod(pkts[txcnt], void *), > - dp_packet_data(packet), size); > dp_packet_set_size((struct dp_packet *)pkts[txcnt], size); > dp_packet_copy_mbuf_flags((struct dp_packet *)pkts[txcnt], > packet); > > -- > 2.7.4
On 18/06/2018 14:15, Eelco Chaudron wrote: > > > On 11 Jun 2018, at 18:21, Tiago Lam wrote: > >> From: Mark Kavanagh <mark.b.kavanagh@intel.com> >> >> Currently, packets are only copied to a single segment in the function >> dpdk_do_tx_copy(). This could be an issue in the case of jumbo frames, >> particularly when multi-segment mbufs are involved. >> >> This patch calculates the number of segments needed by a packet and >> copies the data to each segment. >> >> A new function, dpdk_buf_alloc(), has also been introduced as a >> wrapper >> around the nonpmd_mp_mutex to serialise allocations from a non-pmd >> context. >> >> Co-authored-by: Michael Qiu <qiudayu@chinac.com> >> Co-authored-by: Tiago Lam <tiago.lam@intel.com> >> >> Signed-off-by: Mark Kavanagh <mark.b.kavanagh@intel.com> >> Signed-off-by: Michael Qiu <qiudayu@chinac.com> >> Signed-off-by: Tiago Lam <tiago.lam@intel.com> >> --- >> lib/netdev-dpdk.c | 94 >> +++++++++++++++++++++++++++++++++++++++++++++++++------ >> 1 file changed, 84 insertions(+), 10 deletions(-) >> >> diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c >> index 9b1fb9a..0079e28 100644 >> --- a/lib/netdev-dpdk.c >> +++ b/lib/netdev-dpdk.c >> @@ -515,6 +515,22 @@ dpdk_rte_mzalloc(size_t sz) >> return rte_zmalloc(OVS_VPORT_DPDK, sz, OVS_CACHE_LINE_SIZE); >> } >> >> +static struct rte_mbuf * >> +dpdk_buf_alloc(struct rte_mempool *mp) >> +{ >> + if (!dpdk_thread_is_pmd()) { >> + ovs_mutex_lock(&nonpmd_mp_mutex); > Can you explain why the lock is needed here for non PMD threads? >> + } >> + >> + struct rte_mbuf *mbuf = rte_pktmbuf_alloc(mp); >> + >> + if (!dpdk_thread_is_pmd()) { >> + ovs_mutex_unlock(&nonpmd_mp_mutex); >> + } >> + >> + return mbuf; >> +} >> + >> void >> free_dpdk_buf(struct dp_packet *p) >> { >> @@ -2167,6 +2183,71 @@ out: >> } >> } >> >> +static int > > a bool, true for success might be more usefull here. I've updated this part to return better error codes. More specifically, it now returns `ENOMEM` so the caller knows there's no memory. 
> >> +dpdk_prep_tx_buf(struct dp_packet *packet, struct rte_mbuf **head, >> + struct rte_mempool *mp) > > Maybe the function name should be more what it's doing, i.e > dpdk_clone_dp_packet_to_mbuf(). > Sounds clearer to me, aside from the clone since it's different types of packets. What about `dpdk_copy_dp_packet_to_mbuf()`? >> +{ >> + struct rte_mbuf *temp; >> + uint32_t size = dp_packet_size(packet); >> + uint16_t max_data_len, data_len; >> + uint32_t nb_segs = 0; >> + int i; >> + >> + temp = *head = dpdk_buf_alloc(mp); >> + if (OVS_UNLIKELY(!temp)) { >> + return 1; >> + } >> + >> + /* All new allocated mbuf's max data len is the same */ >> + max_data_len = temp->buf_len - temp->data_off; >> + >> + /* Calculate # of output mbufs. */ >> + nb_segs = size / max_data_len; >> + if (size % max_data_len) { >> + nb_segs = nb_segs + 1; >> + } >> + >> + /* Allocate additional mbufs when multiple output mbufs required. >> */ >> + for (i = 1; i < nb_segs; i++) { >> + temp->next = dpdk_buf_alloc(mp); >> + if (!temp->next) { >> + free_dpdk_buf((struct dp_packet *) *head); >> + *head = NULL; >> + break; >> + } >> + temp = temp->next; >> + } >> + /* We have to do a copy for now */ >> + rte_pktmbuf_pkt_len(*head) = size; >> + temp = *head; >> + >> + data_len = size < max_data_len ? size: max_data_len; > > Can we use max_data_len to copy? It's only valid if rte_pktmbuf_mtod() > returns the first byte, but after alloc it's > rte_pktmbuf_reset_headroom(). So we will overwrite invalid memory. > >> + if (packet->source == DPBUF_DPDK) { >> + *head = &(packet->mbuf); >> + while (temp && head && size > 0) { >> + rte_memcpy(rte_pktmbuf_mtod(temp, void *), >> + dp_packet_data((struct dp_packet *)head), >> data_len); > > Here you assume source and destination mbuf sizes are the same... > Also use container_of for "(struct dp_packet *)head)" > >> + rte_pktmbuf_data_len(temp) = data_len; >> + *head = (*head)->next; >> + size = size - data_len; >> + data_len = size < max_data_len ? 
size: max_data_len; >> + temp = temp->next; >> + } >> + } else { > > Why not use dp_packet_mbuf_write() here? > I missed this. I'll take this approach as well, thanks. Tiago.
On 22 Jun 2018, at 21:05, Lam, Tiago wrote: > On 18/06/2018 14:15, Eelco Chaudron wrote: >> >> >> On 11 Jun 2018, at 18:21, Tiago Lam wrote: >> >>> From: Mark Kavanagh <mark.b.kavanagh@intel.com> >>> >>> Currently, packets are only copied to a single segment in the function >>> dpdk_do_tx_copy(). This could be an issue in the case of jumbo frames, >>> particularly when multi-segment mbufs are involved. >>> >>> This patch calculates the number of segments needed by a packet and >>> copies the data to each segment. >>> >>> A new function, dpdk_buf_alloc(), has also been introduced as a >>> wrapper >>> around the nonpmd_mp_mutex to serialise allocations from a non-pmd >>> context. >>> >>> Co-authored-by: Michael Qiu <qiudayu@chinac.com> >>> Co-authored-by: Tiago Lam <tiago.lam@intel.com> >>> >>> Signed-off-by: Mark Kavanagh <mark.b.kavanagh@intel.com> >>> Signed-off-by: Michael Qiu <qiudayu@chinac.com> >>> Signed-off-by: Tiago Lam <tiago.lam@intel.com> >>> --- >>> lib/netdev-dpdk.c | 94 >>> +++++++++++++++++++++++++++++++++++++++++++++++++------ >>> 1 file changed, 84 insertions(+), 10 deletions(-) >>> >>> diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c >>> index 9b1fb9a..0079e28 100644 >>> --- a/lib/netdev-dpdk.c >>> +++ b/lib/netdev-dpdk.c >>> @@ -515,6 +515,22 @@ dpdk_rte_mzalloc(size_t sz) >>> return rte_zmalloc(OVS_VPORT_DPDK, sz, OVS_CACHE_LINE_SIZE); >>> } >>> >>> +static struct rte_mbuf * >>> +dpdk_buf_alloc(struct rte_mempool *mp) >>> +{ >>> + if (!dpdk_thread_is_pmd()) { >>> + ovs_mutex_lock(&nonpmd_mp_mutex); >> Can you explain why the lock is needed here for non PMD threads? >>> + } >>> + >>> + struct rte_mbuf *mbuf = rte_pktmbuf_alloc(mp); >>> + >>> + if (!dpdk_thread_is_pmd()) { >>> + ovs_mutex_unlock(&nonpmd_mp_mutex); >>> + } >>> + >>> + return mbuf; >>> +} >>> + >>> void >>> free_dpdk_buf(struct dp_packet *p) >>> { >>> @@ -2167,6 +2183,71 @@ out: >>> } >>> } >>> >>> +static int >> >> a bool, true for success might be more usefull here. 
> > I've updated this part to return better error codes. More specifically, > `ENOMEM` so the caller knows there's no memory. > Even better… >> >>> +dpdk_prep_tx_buf(struct dp_packet *packet, struct rte_mbuf **head, >>> + struct rte_mempool *mp) >> >> Maybe the function name should be more what it's doing, i.e >> dpdk_clone_dp_packet_to_mbuf(). >> > > Sounds clearer to me, aside from the clone since it's different types of > packets. What about `dpdk_copy_dp_packet_to_mbuf()`? > Sounds good to me. >>> +{ >>> + struct rte_mbuf *temp; >>> + uint32_t size = dp_packet_size(packet); >>> + uint16_t max_data_len, data_len; >>> + uint32_t nb_segs = 0; >>> + int i; >>> + >>> + temp = *head = dpdk_buf_alloc(mp); >>> + if (OVS_UNLIKELY(!temp)) { >>> + return 1; >>> + } >>> + >>> + /* All new allocated mbuf's max data len is the same */ >>> + max_data_len = temp->buf_len - temp->data_off; >>> + >>> + /* Calculate # of output mbufs. */ >>> + nb_segs = size / max_data_len; >>> + if (size % max_data_len) { >>> + nb_segs = nb_segs + 1; >>> + } >>> + >>> + /* Allocate additional mbufs when multiple output mbufs required. >>> */ >>> + for (i = 1; i < nb_segs; i++) { >>> + temp->next = dpdk_buf_alloc(mp); >>> + if (!temp->next) { >>> + free_dpdk_buf((struct dp_packet *) *head); >>> + *head = NULL; >>> + break; >>> + } >>> + temp = temp->next; >>> + } >>> + /* We have to do a copy for now */ >>> + rte_pktmbuf_pkt_len(*head) = size; >>> + temp = *head; >>> + >>> + data_len = size < max_data_len ? size: max_data_len; >> >> Can we use max_data_len to copy? It's only valid if rte_pktmbuf_mtod() >> returns the first byte, but after alloc it's >> rte_pktmbuf_reset_headroom(). So we will overwrite invalid memory. 
>> >>> + if (packet->source == DPBUF_DPDK) { >>> + *head = &(packet->mbuf); >>> + while (temp && head && size > 0) { >>> + rte_memcpy(rte_pktmbuf_mtod(temp, void *), >>> + dp_packet_data((struct dp_packet *)head), >>> data_len); >> >> Here you assume source and destination mbuf sizes are the same... >> Also use container_of for "(struct dp_packet *)head)" >> >>> + rte_pktmbuf_data_len(temp) = data_len; >>> + *head = (*head)->next; >>> + size = size - data_len; >>> + data_len = size < max_data_len ? size: max_data_len; >>> + temp = temp->next; >>> + } >>> + } else { >> >> Why not use dp_packet_mbuf_write() here? >> > > I missed this. I'll take this approach as well, thanks. I assume you also mean the other two inline comments above this one.
diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index 9b1fb9a..0079e28 100644 --- a/lib/netdev-dpdk.c +++ b/lib/netdev-dpdk.c @@ -515,6 +515,22 @@ dpdk_rte_mzalloc(size_t sz) return rte_zmalloc(OVS_VPORT_DPDK, sz, OVS_CACHE_LINE_SIZE); } +static struct rte_mbuf * +dpdk_buf_alloc(struct rte_mempool *mp) +{ + if (!dpdk_thread_is_pmd()) { + ovs_mutex_lock(&nonpmd_mp_mutex); + } + + struct rte_mbuf *mbuf = rte_pktmbuf_alloc(mp); + + if (!dpdk_thread_is_pmd()) { + ovs_mutex_unlock(&nonpmd_mp_mutex); + } + + return mbuf; +} + void free_dpdk_buf(struct dp_packet *p) { @@ -2167,6 +2183,71 @@ out: } } +static int +dpdk_prep_tx_buf(struct dp_packet *packet, struct rte_mbuf **head, + struct rte_mempool *mp) +{ + struct rte_mbuf *temp; + uint32_t size = dp_packet_size(packet); + uint16_t max_data_len, data_len; + uint32_t nb_segs = 0; + int i; + + temp = *head = dpdk_buf_alloc(mp); + if (OVS_UNLIKELY(!temp)) { + return 1; + } + + /* All new allocated mbuf's max data len is the same */ + max_data_len = temp->buf_len - temp->data_off; + + /* Calculate # of output mbufs. */ + nb_segs = size / max_data_len; + if (size % max_data_len) { + nb_segs = nb_segs + 1; + } + + /* Allocate additional mbufs when multiple output mbufs required. */ + for (i = 1; i < nb_segs; i++) { + temp->next = dpdk_buf_alloc(mp); + if (!temp->next) { + free_dpdk_buf((struct dp_packet *) *head); + *head = NULL; + break; + } + temp = temp->next; + } + /* We have to do a copy for now */ + rte_pktmbuf_pkt_len(*head) = size; + temp = *head; + + data_len = size < max_data_len ? size: max_data_len; + if (packet->source == DPBUF_DPDK) { + *head = &(packet->mbuf); + while (temp && head && size > 0) { + rte_memcpy(rte_pktmbuf_mtod(temp, void *), + dp_packet_data((struct dp_packet *)head), data_len); + rte_pktmbuf_data_len(temp) = data_len; + *head = (*head)->next; + size = size - data_len; + data_len = size < max_data_len ? 
size: max_data_len; + temp = temp->next; + } + } else { + int offset = 0; + while (temp && size > 0) { + memcpy(rte_pktmbuf_mtod(temp, void *), + dp_packet_at(packet, offset, data_len), data_len); + rte_pktmbuf_data_len(temp) = data_len; + temp = temp->next; + size = size - data_len; + offset += data_len; + data_len = size < max_data_len ? size: max_data_len; + } + } + return 0; +} + /* Tx function. Transmit packets indefinitely */ static void dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet_batch *batch) @@ -2183,6 +2264,7 @@ dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet_batch *batch) struct rte_mbuf *pkts[PKT_ARRAY_SIZE]; uint32_t cnt = batch_cnt; uint32_t dropped = 0; + uint32_t i; if (dev->type != DPDK_DEV_VHOST) { /* Check if QoS has been configured for this netdev. */ @@ -2193,27 +2275,19 @@ dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet_batch *batch) uint32_t txcnt = 0; - for (uint32_t i = 0; i < cnt; i++) { + for (i = 0; i < cnt; i++) { struct dp_packet *packet = batch->packets[i]; uint32_t size = dp_packet_size(packet); - if (OVS_UNLIKELY(size > dev->max_packet_len)) { VLOG_WARN_RL(&rl, "Too big size %u max_packet_len %d", size, dev->max_packet_len); - dropped++; continue; } - - pkts[txcnt] = rte_pktmbuf_alloc(dev->mp); - if (OVS_UNLIKELY(!pkts[txcnt])) { + if (dpdk_prep_tx_buf(packet, &pkts[txcnt], dev->mp)) { dropped += cnt - i; break; } - - /* We have to do a copy for now */ - memcpy(rte_pktmbuf_mtod(pkts[txcnt], void *), - dp_packet_data(packet), size); dp_packet_set_size((struct dp_packet *)pkts[txcnt], size); dp_packet_copy_mbuf_flags((struct dp_packet *)pkts[txcnt], packet);