@@ -118,6 +118,8 @@ lib_libopenvswitch_la_SOURCES = \
lib/dpctl.h \
lib/dp-packet.h \
lib/dp-packet.c \
+ lib/dp-packet-gso.c \
+ lib/dp-packet-gso.h \
lib/dpdk.h \
lib/dpif-netdev-extract-study.c \
lib/dpif-netdev-lookup.h \
new file mode 100644
@@ -0,0 +1,168 @@
+/*
+ * Copyright (c) 2023 Red Hat, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <config.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "dp-packet.h"
+#include "dp-packet-gso.h"
+#include "netdev-provider.h"
+#include "openvswitch/vlog.h"
+
+VLOG_DEFINE_THIS_MODULE(dp_packet_gso);
+
+/* Returns a new packet that is a segment of packet 'p'.
+ *
+ * The new packet is initialized with 'hdr_len' bytes from the
+ * start of packet 'p' and then appended with 'data_len' bytes
+ * from the 'data' buffer.
+ *
+ * Note: The packet headers are not updated. */
+static struct dp_packet *
+dp_packet_gso_seg_new(const struct dp_packet *p, size_t hdr_len,
+                      const char *data, size_t data_len)
+{
+    struct dp_packet *seg = dp_packet_new_with_headroom(hdr_len + data_len,
+                                                        dp_packet_headroom(p));
+
+    /* Append the original packet headers and then the payload. */
+    dp_packet_put(seg, dp_packet_data(p), hdr_len);
+    dp_packet_put(seg, data, data_len);
+
+    /* The new segment should have the same offsets. */
+    seg->l2_5_ofs = p->l2_5_ofs;
+    seg->l3_ofs = p->l3_ofs;
+    seg->l4_ofs = p->l4_ofs;
+
+    /* The protocol headers remain the same, so preserve hash and mark. */
+    *dp_packet_rss_ptr(seg) = *dp_packet_rss_ptr(p);
+    *dp_packet_flow_mark_ptr(seg) = *dp_packet_flow_mark_ptr(p);
+
+    /* The segment should inherit all the offloading flags from the
+     * original packet, except for the TCP segmentation, external
+     * buffer and indirect buffer flags. */
+    *dp_packet_ol_flags_ptr(seg) = *dp_packet_ol_flags_ptr(p)
+                                   & DP_PACKET_OL_SUPPORTED_MASK;
+
+    dp_packet_hwol_reset_tcp_seg(seg);
+
+    return seg;
+}
+
+/* Returns the calculated number of TCP segments in packet 'p'.
+ *
+ * Returns at least 1 when 'p' carries no segment size, so that callers
+ * sizing buffers from this value never divide by zero here; such a
+ * packet is rejected later by dp_packet_gso(). */
+int
+dp_packet_gso_nr_segs(struct dp_packet *p)
+{
+    uint16_t segsz = dp_packet_get_tso_segsz(p);
+    const char *data_tail;
+    const char *data_pos;
+
+    /* A zero segment size would make DIV_ROUND_UP() divide by zero.
+     * This function can be called before dp_packet_gso() has a chance
+     * to reject such a packet, so guard here as well. */
+    if (OVS_UNLIKELY(!segsz)) {
+        return 1;
+    }
+
+    data_pos = dp_packet_get_tcp_payload(p);
+    data_tail = (char *) dp_packet_tail(p) - dp_packet_l2_pad_size(p);
+
+    return DIV_ROUND_UP(data_tail - data_pos, segsz);
+}
+
+/* Performs software segmentation (GSO) on packet 'p'.
+ *
+ * Appends the resulting segments to the array of preallocated batches at
+ * '*batches', advancing '*batches' to the last batch used, and returns
+ * true on success.
+ *
+ * Returns false if the packet cannot be segmented (it carries no segment
+ * size), in which case no segments are emitted. */
+bool
+dp_packet_gso(struct dp_packet *p, struct dp_packet_batch **batches)
+{
+    static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
+    struct dp_packet_batch *curr_batch = *batches;
+    struct tcp_header *tcp_hdr;
+    struct ip_header *ip_hdr;
+    struct dp_packet *seg;
+    uint16_t tcp_offset;
+    uint16_t tso_segsz;
+    uint32_t tcp_seq;
+    uint16_t ip_id;
+    int hdr_len;
+    int seg_len;
+
+    tso_segsz = dp_packet_get_tso_segsz(p);
+    if (!tso_segsz) {
+        VLOG_WARN_RL(&rl, "GSO packet with len %d with no segment size.",
+                     dp_packet_size(p));
+        return false;
+    }
+
+    tcp_hdr = dp_packet_l4(p);
+    tcp_offset = TCP_OFFSET(tcp_hdr->tcp_ctl);
+    tcp_seq = ntohl(get_16aligned_be32(&tcp_hdr->tcp_seq));
+    /* Replicated headers cover everything from the Ethernet header up to
+     * and including the TCP header with its options. */
+    hdr_len = ((char *) dp_packet_l4(p) - (char *) dp_packet_eth(p))
+              + tcp_offset * 4;
+    ip_id = 0;
+    if (dp_packet_hwol_is_ipv4(p)) {
+        ip_hdr = dp_packet_l3(p);
+        ip_id = ntohs(ip_hdr->ip_id);
+    }
+
+    const char *data_tail = (char *) dp_packet_tail(p)
+                            - dp_packet_l2_pad_size(p);
+    const char *data_pos = dp_packet_get_tcp_payload(p);
+    int n_segs = dp_packet_gso_nr_segs(p);
+
+    for (int i = 0; i < n_segs; i++) {
+        /* The last segment may be shorter than 'tso_segsz'. */
+        seg_len = data_tail - data_pos;
+        if (seg_len > tso_segsz) {
+            seg_len = tso_segsz;
+        }
+
+        seg = dp_packet_gso_seg_new(p, hdr_len, data_pos, seg_len);
+        data_pos += seg_len;
+
+        /* Update L3 header. */
+        if (dp_packet_hwol_is_ipv4(seg)) {
+            ip_hdr = dp_packet_l3(seg);
+            /* Derive the L3 header length from the copied offsets rather
+             * than sizeof *ip_hdr, so that IPv4 options are counted. */
+            ip_hdr->ip_tot_len = htons((seg->l4_ofs - seg->l3_ofs)
+                                       + dp_packet_l4_size(seg));
+            ip_hdr->ip_id = htons(ip_id);
+            ip_hdr->ip_csum = 0;
+            ip_id++;
+        } else {
+            struct ovs_16aligned_ip6_hdr *ip6_hdr = dp_packet_l3(seg);
+
+            /* The IPv6 payload length excludes the fixed 40-byte IPv6
+             * header but includes any extension headers (RFC 8200).
+             * Do not add the IPv4 header size here. */
+            ip6_hdr->ip6_ctlun.ip6_un1.ip6_un1_plen
+                = htons((seg->l4_ofs - seg->l3_ofs) - sizeof *ip6_hdr
+                        + dp_packet_l4_size(seg));
+        }
+
+        /* Update L4 header. */
+        tcp_hdr = dp_packet_l4(seg);
+        put_16aligned_be32(&tcp_hdr->tcp_seq, htonl(tcp_seq));
+        tcp_seq += seg_len;
+        if (OVS_LIKELY(i < (n_segs - 1))) {
+            /* Reset flags PUSH and FIN unless it is the last segment. */
+            uint16_t tcp_flags = TCP_FLAGS(tcp_hdr->tcp_ctl)
+                                 & ~(TCP_PSH | TCP_FIN);
+            tcp_hdr->tcp_ctl = TCP_CTL(tcp_flags, tcp_offset);
+        }
+
+        /* Move to the next preallocated batch once the current one is
+         * full; the caller sized the array via dp_packet_gso_nr_segs(). */
+        if (dp_packet_batch_is_full(curr_batch)) {
+            curr_batch++;
+        }
+
+        dp_packet_batch_add(curr_batch, seg);
+    }
+
+    *batches = curr_batch;
+    return true;
+}
new file mode 100644
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) 2023 Red Hat, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef DP_PACKET_GSO_H
+#define DP_PACKET_GSO_H 1
+
+/* Make the header self-contained: it uses 'bool' and refers to struct
+ * types that includers may not have declared yet. */
+#include <stdbool.h>
+
+struct dp_packet;
+struct dp_packet_batch;
+
+/* Segments TSO-flagged packet 'p' in software into the preallocated
+ * batches at '*batches', advancing '*batches' as batches fill up.
+ * Returns false if 'p' cannot be segmented. */
+bool dp_packet_gso(struct dp_packet *, struct dp_packet_batch **);
+
+/* Returns the number of TCP segments packet 'p' will produce. */
+int dp_packet_gso_nr_segs(struct dp_packet *);
+
+#endif /* dp-packet-gso.h */
@@ -1131,6 +1131,13 @@ dp_packet_hwol_set_tcp_seg(struct dp_packet *b)
*dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_TCP_SEG;
}
+/* Resets the TCP Segmentation flag in packet 'p', marking it as no
+ * longer requiring TCP segmentation offload (e.g. after software GSO
+ * has already split it). */
+static inline void
+dp_packet_hwol_reset_tcp_seg(struct dp_packet *p)
+{
+    *dp_packet_ol_flags_ptr(p) &= ~DP_PACKET_OL_TX_TCP_SEG;
+}
+
/* Returns 'true' if the IP header has good integrity and the
* checksum in it is complete. */
static inline bool
@@ -2438,6 +2438,7 @@ static bool
netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf)
{
struct dp_packet *pkt = CONTAINER_OF(mbuf, struct dp_packet, mbuf);
+ struct tcp_header *th;
if (!(mbuf->ol_flags & (RTE_MBUF_F_TX_IP_CKSUM | RTE_MBUF_F_TX_L4_MASK
| RTE_MBUF_F_TX_TCP_SEG))) {
@@ -2450,27 +2451,36 @@ netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf)
mbuf->l4_len = 0;
mbuf->outer_l2_len = 0;
mbuf->outer_l3_len = 0;
+ th = dp_packet_l4(pkt);
if (mbuf->ol_flags & RTE_MBUF_F_TX_TCP_SEG) {
- struct tcp_header *th = dp_packet_l4(pkt);
- int hdr_len;
-
if (!th) {
VLOG_WARN_RL(&rl, "%s: TCP Segmentation without L4 header"
" pkt len: %"PRIu32"", dev->up.name, mbuf->pkt_len);
return false;
}
+ }
+
+ if (mbuf->ol_flags & RTE_MBUF_F_TX_TCP_CKSUM) {
+ if (!th) {
+ VLOG_WARN_RL(&rl, "%s: TCP offloading without L4 header"
+ " pkt len: %"PRIu32"", dev->up.name, mbuf->pkt_len);
+ return false;
+ }
mbuf->l4_len = TCP_OFFSET(th->tcp_ctl) * 4;
- mbuf->ol_flags |= RTE_MBUF_F_TX_TCP_CKSUM;
- hdr_len = mbuf->l2_len + mbuf->l3_len + mbuf->l4_len;
mbuf->tso_segsz = dev->mtu - mbuf->l3_len - mbuf->l4_len;
- if (OVS_UNLIKELY((hdr_len + mbuf->tso_segsz) > dev->max_packet_len)) {
- VLOG_WARN_RL(&rl, "%s: Oversized TSO packet. "
- "hdr: %"PRIu32", gso: %"PRIu32", max len: %"PRIu32"",
- dev->up.name, hdr_len, mbuf->tso_segsz,
- dev->max_packet_len);
- return false;
+
+ if (mbuf->ol_flags & RTE_MBUF_F_TX_TCP_SEG) {
+ int hdr_len = mbuf->l2_len + mbuf->l3_len + mbuf->l4_len;
+ if (OVS_UNLIKELY((hdr_len +
+ mbuf->tso_segsz) > dev->max_packet_len)) {
+ VLOG_WARN_RL(&rl, "%s: Oversized TSO packet. hdr: %"PRIu32", "
+ "gso: %"PRIu32", max len: %"PRIu32"",
+ dev->up.name, hdr_len, mbuf->tso_segsz,
+ dev->max_packet_len);
+ return false;
+ }
}
if (mbuf->ol_flags & RTE_MBUF_F_TX_IPV4) {
@@ -2858,6 +2868,7 @@ dpdk_copy_dp_packet_to_mbuf(struct rte_mempool *mp, struct dp_packet *pkt_orig)
mbuf_dest->packet_type = pkt_orig->mbuf.packet_type;
mbuf_dest->ol_flags |= (pkt_orig->mbuf.ol_flags &
~(RTE_MBUF_F_EXTERNAL | RTE_MBUF_F_INDIRECT));
+ mbuf_dest->tso_segsz = pkt_orig->mbuf.tso_segsz;
memcpy(&pkt_dest->l2_pad_size, &pkt_orig->l2_pad_size,
sizeof(struct dp_packet) - offsetof(struct dp_packet, l2_pad_size));
@@ -2916,11 +2927,20 @@ netdev_dpdk_common_send(struct netdev *netdev, struct dp_packet_batch *batch,
struct rte_mbuf **pkts = (struct rte_mbuf **) batch->packets;
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
size_t cnt, pkt_cnt = dp_packet_batch_size(batch);
+ struct dp_packet *packet;
+ bool need_copy = false;
memset(stats, 0, sizeof *stats);
+ DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
+ if (packet->source != DPBUF_DPDK) {
+ need_copy = true;
+ break;
+ }
+ }
+
/* Copy dp-packets to mbufs. */
- if (OVS_UNLIKELY(batch->packets[0]->source != DPBUF_DPDK)) {
+ if (OVS_UNLIKELY(need_copy)) {
cnt = dpdk_copy_batch_to_mbuf(netdev, batch);
stats->tx_failure_drops += pkt_cnt - cnt;
pkt_cnt = cnt;
@@ -35,6 +35,7 @@
#include "coverage.h"
#include "dpif.h"
#include "dp-packet.h"
+#include "dp-packet-gso.h"
#include "openvswitch/dynamic-string.h"
#include "fatal-signal.h"
#include "hash.h"
@@ -56,6 +57,7 @@
#include "svec.h"
#include "openvswitch/vlog.h"
#include "flow.h"
+#include "userspace-tso.h"
#include "util.h"
#ifdef __linux__
#include "tc.h"
@@ -67,8 +69,9 @@ COVERAGE_DEFINE(netdev_received);
COVERAGE_DEFINE(netdev_sent);
COVERAGE_DEFINE(netdev_add_router);
COVERAGE_DEFINE(netdev_get_stats);
-COVERAGE_DEFINE(netdev_send_prepare_drops);
COVERAGE_DEFINE(netdev_push_header_drops);
+COVERAGE_DEFINE(netdev_soft_seg_good);
+COVERAGE_DEFINE(netdev_soft_seg_drops);
struct netdev_saved_flags {
struct netdev *netdev;
@@ -792,60 +795,84 @@ netdev_get_pt_mode(const struct netdev *netdev)
: NETDEV_PT_LEGACY_L2);
}
-/* Check if a 'packet' is compatible with 'netdev_flags'.
- * If a packet is incompatible, return 'false' with the 'errormsg'
- * pointing to a reason. */
-static bool
-netdev_send_prepare_packet(const uint64_t netdev_flags,
- struct dp_packet *packet, char **errormsg)
-{
- if (dp_packet_hwol_is_tso(packet)
- && !(netdev_flags & NETDEV_TX_OFFLOAD_TCP_TSO)) {
- /* Fall back to GSO in software. */
- VLOG_ERR_BUF(errormsg, "No TSO support");
- return false;
- }
-
- /* Packet with IP csum offloading enabled was received with verified csum.
- * Leave the IP csum offloading enabled even with good checksum to the
- * netdev to decide what would be the best to do.
- * Provide a software fallback in case the device doesn't support IP csum
- * offloading. Note: Encapsulated packet must have the inner IP header
- * csum already calculated.
- * Packet with L4 csum offloading enabled was received with verified csum.
- * Leave the L4 csum offloading enabled even with good checksum for the
- * netdev to decide what would be the best to do.
- * Netdev that requires pseudo header csum needs to calculate that.
- * Provide a software fallback in case the netdev doesn't support L4 csum
- * offloading. Note: Encapsulated packet must have the inner L4 header
- * csum already calculated. */
- dp_packet_ol_send_prepare(packet, netdev_flags);
-
- return true;
-}
-
-/* Check if each packet in 'batch' is compatible with 'netdev' features,
- * otherwise either fall back to software implementation or drop it. */
-static void
-netdev_send_prepare_batch(const struct netdev *netdev,
- struct dp_packet_batch *batch)
+/* Attempts to segment GSO flagged packets and send them as multiple bundles.
+ * This function is only used if at least one packet in the current batch is
+ * flagged for TSO and the netdev does not support this.
+ *
+ * The return value is 0 if all batches sent successfully, and an error code
+ * from netdev_class->send() if at least one batch failed to send. */
+static int
+netdev_send_tso(struct netdev *netdev, int qid,
+ struct dp_packet_batch *batch, bool concurrent_txq)
{
+ struct dp_packet_batch *batches;
struct dp_packet *packet;
- size_t i, size = dp_packet_batch_size(batch);
-
- DP_PACKET_BATCH_REFILL_FOR_EACH (i, size, packet, batch) {
- char *errormsg = NULL;
+ int retval = 0;
+ int n_packets;
+ int n_batches;
+ int error;
- if (netdev_send_prepare_packet(netdev->ol_flags, packet, &errormsg)) {
- dp_packet_batch_refill(batch, packet, i);
+ /* Calculate the total number of packets in the batch after
+ * the segmentation. */
+ n_packets = 0;
+ DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
+ if (dp_packet_hwol_is_tso(packet)) {
+ n_packets += dp_packet_gso_nr_segs(packet);
} else {
+ n_packets++;
+ }
+ }
+
+ if (!n_packets) {
+ return 0;
+ }
+
+ /* Allocate enough batches to store all the packets in order. */
+ n_batches = DIV_ROUND_UP(n_packets, NETDEV_MAX_BURST);
+ batches = xmalloc(n_batches * sizeof *batches);
+
+ struct dp_packet_batch *curr_batch = batches;
+ struct dp_packet_batch *last_batch = &batches[n_batches - 1];
+ for (curr_batch = batches; curr_batch <= last_batch; curr_batch++) {
+ dp_packet_batch_init(curr_batch);
+ }
+
+ /* Do the packet segmentation if TSO is flagged. */
+ size_t size = dp_packet_batch_size(batch);
+ size_t k;
+ curr_batch = batches;
+ DP_PACKET_BATCH_REFILL_FOR_EACH (k, size, packet, batch) {
+ if (dp_packet_hwol_is_tso(packet)) {
+ if (dp_packet_gso(packet, &curr_batch)) {
+ COVERAGE_INC(netdev_soft_seg_good);
+ } else {
+ COVERAGE_INC(netdev_soft_seg_drops);
+ }
dp_packet_delete(packet);
- COVERAGE_INC(netdev_send_prepare_drops);
- VLOG_WARN_RL(&rl, "%s: Packet dropped: %s",
- netdev_get_name(netdev), errormsg);
- free(errormsg);
+ } else {
+ if (dp_packet_batch_is_full(curr_batch)) {
+ curr_batch++;
+ }
+
+ dp_packet_batch_add(curr_batch, packet);
+ }
+ }
+
+ for (curr_batch = batches; curr_batch <= last_batch; curr_batch++) {
+ DP_PACKET_BATCH_FOR_EACH (i, packet, curr_batch) {
+ dp_packet_ol_send_prepare(packet, netdev->ol_flags);
+ }
+
+ error = netdev->netdev_class->send(netdev, qid, curr_batch,
+ concurrent_txq);
+ if (!error) {
+ COVERAGE_INC(netdev_sent);
+ } else {
+ retval = error;
}
}
+ free(batches);
+ return retval;
}
/* Sends 'batch' on 'netdev'. Returns 0 if successful (for every packet),
@@ -877,11 +904,21 @@ int
netdev_send(struct netdev *netdev, int qid, struct dp_packet_batch *batch,
bool concurrent_txq)
{
+ const uint64_t netdev_flags = netdev->ol_flags;
+ struct dp_packet *packet;
int error;
- netdev_send_prepare_batch(netdev, batch);
- if (OVS_UNLIKELY(dp_packet_batch_is_empty(batch))) {
- return 0;
+ if (userspace_tso_enabled() &&
+ !(netdev_flags & NETDEV_TX_OFFLOAD_TCP_TSO)) {
+ DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
+ if (dp_packet_hwol_is_tso(packet)) {
+ return netdev_send_tso(netdev, qid, batch, concurrent_txq);
+ }
+ }
+ }
+
+ DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
+ dp_packet_ol_send_prepare(packet, netdev_flags);
}
error = netdev->netdev_class->send(netdev, qid, batch, concurrent_txq);
@@ -427,7 +427,7 @@ add_mpls(struct dp_packet *packet, ovs_be16 ethtype, ovs_be32 lse,
}
if (!l3_encap) {
- struct mpls_hdr *header = dp_packet_push_uninit(packet, MPLS_HLEN);
+ struct mpls_hdr *header = dp_packet_resize_l2(packet, MPLS_HLEN);
put_16aligned_be32(&header->mpls_lse, lse);
packet->l2_5_ofs = 0;
@@ -513,7 +513,7 @@ push_nsh(struct dp_packet *packet, const struct nsh_hdr *nsh_hdr_src)
OVS_NOT_REACHED();
}
- nsh = (struct nsh_hdr *) dp_packet_push_uninit(packet, length);
+ nsh = (struct nsh_hdr *) dp_packet_resize_l2(packet, length);
memcpy(nsh, nsh_hdr_src, length);
nsh->next_proto = next_proto;
packet->packet_type = htonl(PT_NSH);
@@ -1,5 +1,50 @@
AT_BANNER([datapath-sanity])
+AT_SETUP([datapath - netdev offload software fallback])
+AT_SKIP_IF([test $HAVE_NC = no])
+OVS_TRAFFIC_VSWITCHD_START()
+
+AT_CHECK([ovs-ofctl add-flow br0 "actions=normal"])
+
+ADD_NAMESPACES(at_ns0, at_ns1)
+
+ADD_VETH(p0, at_ns0, br0, "10.1.1.1/24")
+ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24")
+
+dnl Test the case where only one side has all checksum and TSO offloads disabled.
+ethtool -K ovs-p1 tso off
+ethtool -K ovs-p1 sg off
+
+dnl Reinitialize
+ovs-vsctl del-port ovs-p1
+ovs-vsctl add-port br0 ovs-p1
+
+NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl
+3 packets transmitted, 3 received, 0% packet loss, time 0ms
+])
+
+NS_CHECK_EXEC([at_ns1], [ping -s 3200 -q -c 3 -i 0.3 -w 2 10.1.1.1 | FORMAT_PING], [0], [dnl
+3 packets transmitted, 3 received, 0% packet loss, time 0ms
+])
+
+NETNS_DAEMONIZE([at_ns0], [nc -l 1234 > data_0], [nc1.pid])
+NETNS_DAEMONIZE([at_ns1], [nc -l 1234 > data_1], [nc2.pid])
+
+AT_CHECK([dd if=/dev/urandom of=payload.bin bs=60000 count=1 2> /dev/null])
+on_exit 'rm -f payload.bin'
+
+NS_CHECK_EXEC([at_ns0], [nc $NC_EOF_OPT 10.1.1.2 1234 < payload.bin])
+NS_CHECK_EXEC([at_ns1], [nc $NC_EOF_OPT 10.1.1.1 1234 < payload.bin])
+
+dnl Wait until transfer completes
+OVS_WAIT_WHILE([kill -0 `cat nc1.pid` `cat nc2.pid`])
+
+AT_CHECK([diff -q payload.bin data_0], [0], [])
+AT_CHECK([diff -q payload.bin data_1], [0], [])
+
+OVS_TRAFFIC_VSWITCHD_STOP
+AT_CLEANUP
+
AT_SETUP([datapath - ping between two ports])
OVS_TRAFFIC_VSWITCHD_START()