@@ -118,6 +118,8 @@ lib_libopenvswitch_la_SOURCES = \
lib/dpctl.h \
lib/dp-packet.h \
lib/dp-packet.c \
+ lib/dp-packet-gso.c \
+ lib/dp-packet-gso.h \
lib/dpdk.h \
lib/dpif-netdev-extract-study.c \
lib/dpif-netdev-lookup.h \
new file mode 100644
@@ -0,0 +1,172 @@
+/*
+ * Copyright (c) 2021 Red Hat, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <config.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "coverage.h"
+#include "dp-packet.h"
+#include "dp-packet-gso.h"
+#include "netdev-provider.h"
+#include "openvswitch/vlog.h"
+
+VLOG_DEFINE_THIS_MODULE(dp_packet_gso);
+
+COVERAGE_DEFINE(soft_seg_good);
+
+/* Returns a new packet that is a segment of packet 'p'.
+ *
+ * The new packet is initialized with 'hdr_len' bytes from the
+ * start of packet 'p' and then appended with 'data_len' bytes
+ * from the 'data' buffer.
+ *
+ * Note: The packet headers are not updated. */
+static struct dp_packet *
+dp_packet_gso_seg_new(const struct dp_packet *p, size_t hdr_len,
+ const char *data, size_t data_len)
+{
+ struct dp_packet *seg = dp_packet_new_with_headroom(hdr_len + data_len,
+ dp_packet_headroom(p));
+
+ /* Append the original packet headers and then the payload. */
+ dp_packet_put(seg, dp_packet_data(p), hdr_len);
+ dp_packet_put(seg, data, data_len);
+
+ /* The new segment should have the same offsets. */
+ seg->l2_5_ofs = p->l2_5_ofs;
+ seg->l3_ofs = p->l3_ofs;
+ seg->l4_ofs = p->l4_ofs;
+
+ /* The protocol headers remain the same, so preserve hash and mark. */
+ *dp_packet_rss_ptr(seg) = dp_packet_get_rss_hash(p);
+ *dp_packet_flow_mark_ptr(seg) = *dp_packet_flow_mark_ptr(p);
+
+ /* The segment should inherit all the offloading flags from the
+ * original packet, except for the TCP segmentation, external
+ * buffer and indirect buffer flags. */
+ *dp_packet_ol_flags_ptr(seg) = *dp_packet_ol_flags_ptr(p)
+ & ~(DP_PACKET_OL_TX_TCP_SEG | DP_PACKET_OL_EXTERNAL
+ | DP_PACKET_OL_INDIRECT);
+
+ dp_packet_hwol_reset_tcp_seg(seg);
+
+ return seg;
+}
+
+/* Returns the calculated number of TCP segments in packet 'p'. */
+int
+dp_packet_gso_nr_segs(struct dp_packet *p)
+{
+ uint16_t segsz = dp_packet_get_tso_segsz(p);
+ const char *data_tail;
+ const char *data_pos;
+ int n_segs;
+
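+    /* For example, with a 1400 byte TSO segment size and 3000 bytes of TCP
+     * payload, the packet splits into three segments (1400 + 1400 + 200). */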
+    data_pos = dp_packet_get_tcp_payload(p);
+    data_tail = (char *) dp_packet_tail(p) - dp_packet_l2_pad_size(p);
+    n_segs = DIV_ROUND_UP(data_tail - data_pos, segsz);
+
+    return n_segs;
+}
+
+/* Performs software segmentation of packet 'p'.
+ *
+ * The resulting segments are added to the array of preallocated batches
+ * 'batches', starting at batch position 'batch_pos'. */
+void
+dp_packet_gso(struct dp_packet *p, struct dp_packet_batch *batches,
+ size_t *batch_pos)
+{
+ struct tcp_header *tcp_hdr;
+ struct ip_header *ip_hdr;
+ struct dp_packet *seg;
+ uint16_t tcp_offset;
+ uint16_t tso_segsz;
+ uint32_t tcp_seq;
+ uint16_t ip_id;
+ int hdr_len;
+
+ tso_segsz = dp_packet_get_tso_segsz(p);
+ if (!tso_segsz) {
+ VLOG_WARN("GSO packet with len %d with no segment size.",
+ dp_packet_size(p));
+ return;
+ }
+
+ tcp_hdr = dp_packet_l4(p);
+ tcp_offset = TCP_OFFSET(tcp_hdr->tcp_ctl);
+ tcp_seq = ntohl(get_16aligned_be32(&tcp_hdr->tcp_seq));
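+    /* Length of the headers replicated in every segment: everything from
+     * the Ethernet header up to and including the TCP header (the TCP data
+     * offset is expressed in 32-bit words). */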
+ hdr_len = ((char *) dp_packet_l4(p) - (char *) dp_packet_eth(p))
+ + tcp_offset * 4;
+ ip_id = 0;
+ if (dp_packet_hwol_is_ipv4(p)) {
+ ip_hdr = dp_packet_l3(p);
+ ip_id = ntohs(ip_hdr->ip_id);
+ }
+
+ const char *data_tail = (char *) dp_packet_tail(p)
+ - dp_packet_l2_pad_size(p);
+ const char *data_pos = dp_packet_get_tcp_payload(p);
+ int n_segs = dp_packet_gso_nr_segs(p);
+ int seg_len;
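+
+    /* Carve the TCP payload into chunks of at most 'tso_segsz' bytes; only
+     * the last segment may carry less. */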
+ for (int i = 0; i < n_segs; i++) {
+ seg_len = data_tail - data_pos;
+ if (seg_len > tso_segsz) {
+ seg_len = tso_segsz;
+ }
+
+ seg = dp_packet_gso_seg_new(p, hdr_len, data_pos, seg_len);
+ data_pos += seg_len;
+
+ /* Update L3 header. */
+ if (dp_packet_hwol_is_ipv4(seg)) {
+ ip_hdr = dp_packet_l3(seg);
+ ip_hdr->ip_tot_len = htons(sizeof *ip_hdr +
+ dp_packet_l4_size(seg));
+ ip_hdr->ip_id = htons(ip_id);
+ ip_hdr->ip_csum = 0;
+ ip_id++;
+ } else {
+ struct ovs_16aligned_ip6_hdr *ip6_hdr = dp_packet_l3(seg);
+
+            /* The IPv6 payload length does not include the fixed L3 header.
+             * This assumes no extension headers precede the TCP header. */
+            ip6_hdr->ip6_ctlun.ip6_un1.ip6_un1_plen
+                = htons(dp_packet_l4_size(seg));
+        }
+
+ /* Update L4 header. */
+ tcp_hdr = dp_packet_l4(seg);
+ put_16aligned_be32(&tcp_hdr->tcp_seq, htonl(tcp_seq));
+ tcp_seq += seg_len;
+ if (OVS_LIKELY(i < (n_segs - 1))) {
+ /* Reset flags PUSH and FIN unless it is the last segment. */
+ uint16_t tcp_flags = TCP_FLAGS(tcp_hdr->tcp_ctl)
+ & ~(TCP_PSH | TCP_FIN);
+ tcp_hdr->tcp_ctl = TCP_CTL(tcp_flags, tcp_offset);
+ }
+
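+        /* Move on to the next preallocated batch once the current one is
+         * full so that the segments keep their original order. */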
+        if (dp_packet_batch_is_full(&batches[*batch_pos])) {
+            *batch_pos += 1;
+        }
+
+        dp_packet_batch_add(&batches[*batch_pos], seg);
+ }
+
+ COVERAGE_INC(soft_seg_good);
+}
new file mode 100644
@@ -0,0 +1,24 @@
+/*
+ * Copyright (c) 2021 Red Hat, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef DP_PACKET_GSO_H
+#define DP_PACKET_GSO_H 1
+
+#include <stddef.h>
+
+struct dp_packet;
+struct dp_packet_batch;
+
+void dp_packet_gso(struct dp_packet *p, struct dp_packet_batch *batches,
+                   size_t *batch_pos);
+int dp_packet_gso_nr_segs(struct dp_packet *p);
+
+#endif /* dp-packet-gso.h */
@@ -86,6 +86,10 @@ enum dp_packet_offload_mask {
DEF_OL_FLAG(DP_PACKET_OL_TX_SCTP_CKSUM, RTE_MBUF_F_TX_SCTP_CKSUM, 0x800),
/* Offload IP checksum. */
DEF_OL_FLAG(DP_PACKET_OL_TX_IP_CKSUM, RTE_MBUF_F_TX_IP_CKSUM, 0x1000),
+ /* External Buffer attached. */
+ DEF_OL_FLAG(DP_PACKET_OL_EXTERNAL, RTE_MBUF_F_EXTERNAL, 0x4000),
+ /* Indirect Buffer attached. */
+ DEF_OL_FLAG(DP_PACKET_OL_INDIRECT, RTE_MBUF_F_INDIRECT, 0x8000),
/* Adding new field requires adding to DP_PACKET_OL_SUPPORTED_MASK. */
};
@@ -1131,6 +1135,13 @@ dp_packet_hwol_set_tcp_seg(struct dp_packet *b)
*dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_TCP_SEG;
}
+/* Resets TCP Segmentation flag in packet 'p'. */
+static inline void
+dp_packet_hwol_reset_tcp_seg(struct dp_packet *p)
+{
+ *dp_packet_ol_flags_ptr(p) &= ~DP_PACKET_OL_TX_TCP_SEG;
+}
+
/* Returns 'true' if the IP header has good integrity and the
* checksum in it is complete. */
static inline bool
@@ -2288,6 +2288,7 @@ static bool
netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf)
{
struct dp_packet *pkt = CONTAINER_OF(mbuf, struct dp_packet, mbuf);
+ struct tcp_header *th;
if (!(mbuf->ol_flags & (RTE_MBUF_F_TX_IP_CKSUM | RTE_MBUF_F_TX_L4_MASK
| RTE_MBUF_F_TX_TCP_SEG))) {
@@ -2299,27 +2300,38 @@ netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf)
mbuf->l4_len = 0;
mbuf->outer_l2_len = 0;
mbuf->outer_l3_len = 0;
+ th = dp_packet_l4(pkt);
if (mbuf->ol_flags & RTE_MBUF_F_TX_TCP_SEG) {
- struct tcp_header *th = dp_packet_l4(pkt);
- int hdr_len;
-
if (!th) {
VLOG_WARN_RL(&rl, "%s: TCP Segmentation without L4 header"
+ " pkt len: %"PRIu32"", dev->up.name, mbuf->pkt_len);
+ return false;
+ }
+
+ mbuf->ol_flags |= RTE_MBUF_F_TX_TCP_CKSUM;
+ }
+
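+    /* TSO implies TCP checksum offloading (set above), so the block below
+     * covers both plain checksum offload and TSO packets. */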
+ if (mbuf->ol_flags & RTE_MBUF_F_TX_TCP_CKSUM) {
+ if (!th) {
+ VLOG_WARN_RL(&rl, "%s: TCP offloading without L4 header"
" pkt len: %"PRIu32"", dev->up.name, mbuf->pkt_len);
return false;
}
mbuf->l4_len = TCP_OFFSET(th->tcp_ctl) * 4;
- mbuf->ol_flags |= RTE_MBUF_F_TX_TCP_CKSUM;
- hdr_len = mbuf->l2_len + mbuf->l3_len + mbuf->l4_len;
mbuf->tso_segsz = dev->mtu - mbuf->l3_len - mbuf->l4_len;
- if (OVS_UNLIKELY((hdr_len + mbuf->tso_segsz) > dev->max_packet_len)) {
- VLOG_WARN_RL(&rl, "%s: Oversized TSO packet. "
- "hdr: %"PRIu32", gso: %"PRIu32", max len: %"PRIu32"",
- dev->up.name, hdr_len, mbuf->tso_segsz,
- dev->max_packet_len);
- return false;
+
+ if (mbuf->ol_flags & RTE_MBUF_F_TX_TCP_SEG) {
+ int hdr_len = mbuf->l2_len + mbuf->l3_len + mbuf->l4_len;
+ if (OVS_UNLIKELY((hdr_len +
+ mbuf->tso_segsz) > dev->max_packet_len)) {
+ VLOG_WARN_RL(&rl, "%s: Oversized TSO packet. hdr: %"PRIu32", "
+ "gso: %"PRIu32", max len: %"PRIu32"",
+ dev->up.name, hdr_len, mbuf->tso_segsz,
+ dev->max_packet_len);
+ return false;
+ }
}
if (mbuf->ol_flags & RTE_MBUF_F_TX_IPV4) {
@@ -2707,6 +2719,7 @@ dpdk_copy_dp_packet_to_mbuf(struct rte_mempool *mp, struct dp_packet *pkt_orig)
mbuf_dest->packet_type = pkt_orig->mbuf.packet_type;
mbuf_dest->ol_flags |= (pkt_orig->mbuf.ol_flags &
~(RTE_MBUF_F_EXTERNAL | RTE_MBUF_F_INDIRECT));
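+    /* The memcpy() below starts at l2_pad_size, so mbuf fields such as the
+     * TSO segment size have to be carried over explicitly. */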
+ mbuf_dest->tso_segsz = pkt_orig->mbuf.tso_segsz;
memcpy(&pkt_dest->l2_pad_size, &pkt_orig->l2_pad_size,
sizeof(struct dp_packet) - offsetof(struct dp_packet, l2_pad_size));
@@ -2765,11 +2778,20 @@ netdev_dpdk_common_send(struct netdev *netdev, struct dp_packet_batch *batch,
struct rte_mbuf **pkts = (struct rte_mbuf **) batch->packets;
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
size_t cnt, pkt_cnt = dp_packet_batch_size(batch);
+ struct dp_packet *packet;
+ bool need_copy = false;
memset(stats, 0, sizeof *stats);
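+
+    /* Software GSO may produce batches that mix DPDK and non-DPDK buffers,
+     * so check every packet rather than only the first one. */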
+ DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
+ if (packet->source != DPBUF_DPDK) {
+ need_copy = true;
+ break;
+ }
+ }
+
/* Copy dp-packets to mbufs. */
- if (OVS_UNLIKELY(batch->packets[0]->source != DPBUF_DPDK)) {
+ if (OVS_UNLIKELY(need_copy)) {
cnt = dpdk_copy_batch_to_mbuf(netdev, batch);
stats->tx_failure_drops += pkt_cnt - cnt;
pkt_cnt = cnt;
@@ -6849,55 +6849,6 @@ af_packet_sock(void)
return sock;
}
-static int
-netdev_linux_parse_l2(struct dp_packet *b, uint16_t *l4proto)
-{
- struct eth_header *eth_hdr;
- ovs_be16 eth_type;
- int l2_len;
-
- eth_hdr = dp_packet_at(b, 0, ETH_HEADER_LEN);
- if (!eth_hdr) {
- return -EINVAL;
- }
-
- l2_len = ETH_HEADER_LEN;
- eth_type = eth_hdr->eth_type;
- if (eth_type_vlan(eth_type)) {
- struct vlan_header *vlan = dp_packet_at(b, l2_len, VLAN_HEADER_LEN);
-
- if (!vlan) {
- return -EINVAL;
- }
-
- eth_type = vlan->vlan_next_type;
- l2_len += VLAN_HEADER_LEN;
- }
-
- if (eth_type == htons(ETH_TYPE_IP)) {
- struct ip_header *ip_hdr = dp_packet_at(b, l2_len, IP_HEADER_LEN);
-
- if (!ip_hdr) {
- return -EINVAL;
- }
-
- *l4proto = ip_hdr->ip_proto;
- dp_packet_hwol_set_tx_ipv4(b);
- } else if (eth_type == htons(ETH_TYPE_IPV6)) {
- struct ovs_16aligned_ip6_hdr *nh6;
-
- nh6 = dp_packet_at(b, l2_len, IPV6_HEADER_LEN);
- if (!nh6) {
- return -EINVAL;
- }
-
- *l4proto = nh6->ip6_ctlun.ip6_un1.ip6_un1_nxt;
- dp_packet_hwol_set_tx_ipv6(b);
- }
-
- return 0;
-}
-
/* Initializes packet 'b' with features enabled in the prepended
* struct virtio_net_hdr. Returns 0 if successful, otherwise a
* positive errno value. */
@@ -6915,15 +6866,6 @@ netdev_linux_parse_vnet_hdr(struct dp_packet *b)
}
if (vnet->flags == VIRTIO_NET_HDR_F_NEEDS_CSUM) {
- uint16_t l4proto = 0;
-
- if (netdev_linux_parse_l2(b, &l4proto)) {
- return EINVAL;
- }
-
- if (l4proto == IPPROTO_UDP) {
- dp_packet_hwol_set_csum_udp(b);
- }
/* The packet has offloaded checksum. However, there is no
* additional information like the protocol used, so it would
* require to parse the packet here. The checksum starting point
@@ -35,6 +35,7 @@
#include "coverage.h"
#include "dpif.h"
#include "dp-packet.h"
+#include "dp-packet-gso.h"
#include "openvswitch/dynamic-string.h"
#include "fatal-signal.h"
#include "hash.h"
@@ -56,6 +57,7 @@
#include "svec.h"
#include "openvswitch/vlog.h"
#include "flow.h"
+#include "userspace-tso.h"
#include "util.h"
#ifdef __linux__
#include "tc.h"
@@ -67,7 +69,6 @@ COVERAGE_DEFINE(netdev_received);
COVERAGE_DEFINE(netdev_sent);
COVERAGE_DEFINE(netdev_add_router);
COVERAGE_DEFINE(netdev_get_stats);
-COVERAGE_DEFINE(netdev_send_prepare_drops);
COVERAGE_DEFINE(netdev_push_header_drops);
struct netdev_saved_flags {
@@ -792,60 +793,67 @@ netdev_get_pt_mode(const struct netdev *netdev)
: NETDEV_PT_LEGACY_L2);
}
-/* Check if a 'packet' is compatible with 'netdev_flags'.
- * If a packet is incompatible, return 'false' with the 'errormsg'
- * pointing to a reason. */
-static bool
-netdev_send_prepare_packet(const uint64_t netdev_flags,
- struct dp_packet *packet, char **errormsg)
-{
- if (dp_packet_hwol_is_tso(packet)
- && !(netdev_flags & NETDEV_TX_OFFLOAD_TCP_TSO)) {
- /* Fall back to GSO in software. */
- VLOG_ERR_BUF(errormsg, "No TSO support");
- return false;
- }
-
- /* Packet with IP csum offloading enabled was received with verified csum.
- * Leave the IP csum offloading enabled even with good checksum to the
- * netdev to decide what would be the best to do.
- * Provide a software fallback in case the device doesn't support IP csum
- * offloading. Note: Encapsulated packet must have the inner IP header
- * csum already calculated.
- * Packet with L4 csum offloading enabled was received with verified csum.
- * Leave the L4 csum offloading enabled even with good checksum for the
- * netdev to decide what would be the best to do.
- * Netdev that requires pseudo header csum needs to calculate that.
- * Provide a software fallback in case the netdev doesn't support L4 csum
- * offloading. Note: Encapsulated packet must have the inner L4 header
- * csum already calculated. */
- dp_packet_ol_send_prepare(packet, netdev_flags);
-
- return true;
-}
-
-/* Check if each packet in 'batch' is compatible with 'netdev' features,
- * otherwise either fall back to software implementation or drop it. */
-static void
-netdev_send_prepare_batch(const struct netdev *netdev,
- struct dp_packet_batch *batch)
+static int
+netdev_send_tso(struct netdev *netdev, int qid,
+ struct dp_packet_batch *batch, bool concurrent_txq)
{
+ struct dp_packet_batch *batches;
struct dp_packet *packet;
- size_t i, size = dp_packet_batch_size(batch);
+ int n_packets;
+ int n_batches;
+ int error;
- DP_PACKET_BATCH_REFILL_FOR_EACH (i, size, packet, batch) {
- char *errormsg = NULL;
+ /* Calculate the total number of packets in the batch after
+ * the segmentation. */
+ n_packets = 0;
+ DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
+ if (dp_packet_hwol_is_tso(packet)) {
+ n_packets += dp_packet_gso_nr_segs(packet);
+ } else {
+ n_packets++;
+ }
+ }
- if (netdev_send_prepare_packet(netdev->ol_flags, packet, &errormsg)) {
- dp_packet_batch_refill(batch, packet, i);
+ if (!n_packets) {
+ return 0;
+ }
+
+ /* Allocate enough batches to store all the packets in order. */
+ n_batches = DIV_ROUND_UP(n_packets, NETDEV_MAX_BURST);
+ batches = xmalloc(n_batches * sizeof(struct dp_packet_batch));
+    size_t batch_pos;
+    for (batch_pos = 0; batch_pos < n_batches; batch_pos++) {
+        dp_packet_batch_init(&batches[batch_pos]);
+    }
+
+    /* Do the packet segmentation if TSO is flagged. */
+ size_t size = dp_packet_batch_size(batch);
+ size_t k;
+ batch_pos = 0;
+ DP_PACKET_BATCH_REFILL_FOR_EACH (k, size, packet, batch) {
+ if (dp_packet_hwol_is_tso(packet)) {
+ dp_packet_gso(packet, batches, &batch_pos);
} else {
- dp_packet_delete(packet);
- COVERAGE_INC(netdev_send_prepare_drops);
- VLOG_WARN_RL(&rl, "%s: Packet dropped: %s",
- netdev_get_name(netdev), errormsg);
- free(errormsg);
+ if (dp_packet_batch_is_full(&batches[batch_pos])) {
+ batch_pos++;
+ }
+
+ dp_packet_batch_add(&batches[batch_pos], packet);
+ }
+ }
+
+ for (batch_pos = 0; batch_pos < n_batches; batch_pos++) {
+ DP_PACKET_BATCH_FOR_EACH (i, packet, (&batches[batch_pos])) {
+ dp_packet_ol_send_prepare(packet, netdev->ol_flags);
+ }
+
+ error = netdev->netdev_class->send(netdev, qid, &batches[batch_pos],
+ concurrent_txq);
+ if (!error) {
+ COVERAGE_INC(netdev_sent);
}
}
+ free(batches);
+ return 0;
}
/* Sends 'batch' on 'netdev'. Returns 0 if successful (for every packet),
@@ -877,11 +885,21 @@ int
netdev_send(struct netdev *netdev, int qid, struct dp_packet_batch *batch,
bool concurrent_txq)
{
+ const uint64_t netdev_flags = netdev->ol_flags;
+ struct dp_packet *packet;
int error;
- netdev_send_prepare_batch(netdev, batch);
- if (OVS_UNLIKELY(dp_packet_batch_is_empty(batch))) {
- return 0;
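+    /* Fall back to software segmentation when the batch carries TSO-marked
+     * packets and the device cannot segment them in hardware. */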
+ if (userspace_tso_enabled() &&
+ !(netdev_flags & NETDEV_TX_OFFLOAD_TCP_TSO)) {
+ DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
+ if (dp_packet_hwol_is_tso(packet)) {
+ return netdev_send_tso(netdev, qid, batch, concurrent_txq);
+ }
+ }
+ }
+
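+    /* Resolve in software any offloads that the device does not support
+     * before handing the batch to the provider. */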
+ DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
+ dp_packet_ol_send_prepare(packet, netdev_flags);
}
error = netdev->netdev_class->send(netdev, qid, batch, concurrent_txq);
@@ -427,7 +427,7 @@ add_mpls(struct dp_packet *packet, ovs_be16 ethtype, ovs_be32 lse,
}
if (!l3_encap) {
- struct mpls_hdr *header = dp_packet_push_uninit(packet, MPLS_HLEN);
+ struct mpls_hdr *header = dp_packet_resize_l2(packet, MPLS_HLEN);
put_16aligned_be32(&header->mpls_lse, lse);
packet->l2_5_ofs = 0;
@@ -513,7 +513,7 @@ push_nsh(struct dp_packet *packet, const struct nsh_hdr *nsh_hdr_src)
OVS_NOT_REACHED();
}
- nsh = (struct nsh_hdr *) dp_packet_push_uninit(packet, length);
+ nsh = (struct nsh_hdr *) dp_packet_resize_l2(packet, length);
memcpy(nsh, nsh_hdr_src, length);
nsh->next_proto = next_proto;
packet->packet_type = htonl(PT_NSH);