@@ -35,6 +35,7 @@
#include "bitmap.h"
#include "cmap.h"
+#include "ccmap.h"
#include "conntrack.h"
#include "conntrack-tp.h"
#include "coverage.h"
@@ -114,6 +115,7 @@ COVERAGE_DEFINE(datapath_drop_invalid_port);
COVERAGE_DEFINE(datapath_drop_invalid_bond);
COVERAGE_DEFINE(datapath_drop_invalid_tnl_port);
COVERAGE_DEFINE(datapath_drop_rx_invalid_packet);
+COVERAGE_DEFINE(datapath_direct_output_packet);
/* Protects against changes to 'dp_netdevs'. */
static struct ovs_mutex dp_netdev_mutex = OVS_MUTEX_INITIALIZER;
@@ -543,6 +545,8 @@ struct dp_netdev_flow {
/* Hash table index by unmasked flow. */
const struct cmap_node node; /* In owning dp_netdev_pmd_thread's */
/* 'flow_table'. */
+ const struct cmap_node direct_output_node; /* In dp_netdev_pmd_thread's
+ 'direct_output_table'. */
const struct cmap_node mark_node; /* In owning flow_mark's mark_to_flow */
const ovs_u128 ufid; /* Unique flow identifier. */
const ovs_u128 mega_ufid; /* Unique mega flow identifier. */
@@ -556,7 +560,8 @@ struct dp_netdev_flow {
struct ovs_refcount ref_cnt;
bool dead;
- uint32_t mark; /* Unique flow mark assigned to a flow */
+ uint32_t mark; /* Unique flow mark for netdev offloading. */
+ uint64_t direct_output_mark; /* Unique flow mark for direct output. */
/* Statistics. */
struct dp_netdev_flow_stats stats;
@@ -690,12 +695,19 @@ struct dp_netdev_pmd_thread {
/* Flow-Table and classifiers
*
- * Writers of 'flow_table' must take the 'flow_mutex'. Corresponding
- * changes to 'classifiers' must be made while still holding the
- * 'flow_mutex'.
+     * Writers of 'flow_table'/'direct_output_table' and the 'n_*' ccmaps
+     * must take the 'flow_mutex'. Corresponding changes to 'classifiers'
+     * must be made while still holding the 'flow_mutex'.
*/
struct ovs_mutex flow_mutex;
struct cmap flow_table OVS_GUARDED; /* Flow table. */
+ struct cmap direct_output_table OVS_GUARDED; /* Flow table with direct
+ output flows only. */
+ struct ccmap n_flows OVS_GUARDED; /* Number of flows in 'flow_table'
+ per in_port. */
+ struct ccmap n_direct_flows OVS_GUARDED; /* Number of flows in
+ 'direct_output_table'
+ per in_port. */
/* One classifier per in_port polled by the pmd */
struct cmap classifiers;
@@ -925,6 +937,24 @@ pmd_perf_metrics_enabled(const struct dp_netdev_pmd_thread *pmd);
static void queue_netdev_flow_del(struct dp_netdev_pmd_thread *pmd,
struct dp_netdev_flow *flow);
+static void dp_netdev_direct_output_insert(struct dp_netdev_pmd_thread *pmd,
+ struct dp_netdev_flow *flow)
+ OVS_REQUIRES(pmd->flow_mutex);
+static void dp_netdev_direct_output_remove(struct dp_netdev_pmd_thread *pmd,
+ struct dp_netdev_flow *flow)
+ OVS_REQUIRES(pmd->flow_mutex);
+
+static bool dp_netdev_flow_is_direct_output(const struct flow_wildcards *wc,
+ const struct nlattr *actions,
+ size_t actions_len);
+static bool
+dp_netdev_direct_output_enabled(const struct dp_netdev_pmd_thread *pmd,
+ odp_port_t in_port);
+static struct dp_netdev_flow *
+dp_netdev_direct_output_lookup(const struct dp_netdev_pmd_thread *pmd,
+ odp_port_t in_port,
+                               ovs_be16 dl_type, uint8_t nw_frag);
+
static void
emc_cache_init(struct emc_cache *flow_cache)
{
@@ -2841,7 +2871,9 @@ dp_netdev_pmd_remove_flow(struct dp_netdev_pmd_thread *pmd,
cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
ovs_assert(cls != NULL);
dpcls_remove(cls, &flow->cr);
+ dp_netdev_direct_output_remove(pmd, flow);
cmap_remove(&pmd->flow_table, node, dp_netdev_flow_hash(&flow->ufid));
+ ccmap_dec(&pmd->n_flows, odp_to_u32(in_port));
if (flow->mark != INVALID_FLOW_MARK) {
queue_netdev_flow_del(pmd, flow);
}
@@ -3623,10 +3655,165 @@ dp_netdev_get_mega_ufid(const struct match *match, ovs_u128 *mega_ufid)
odp_flow_key_hash(&masked_flow, sizeof masked_flow, mega_ufid);
}
+static uint64_t
+dp_netdev_direct_output_mark(odp_port_t in_port,
+ ovs_be16 dl_type, uint8_t nw_frag)
+{
+ return ((uint64_t) odp_to_u32(in_port) << 32)
+ | ((uint32_t) ntohs(dl_type) << 16) | nw_frag;
+}
+
+static struct dp_netdev_flow *
+dp_netdev_direct_output_lookup(const struct dp_netdev_pmd_thread *pmd,
+ odp_port_t in_port,
+ ovs_be16 dl_type, uint8_t nw_frag)
+{
+ uint32_t hash;
+ uint64_t mark;
+ struct dp_netdev_flow *flow;
+
+ mark = dp_netdev_direct_output_mark(in_port, dl_type, nw_frag);
+ hash = hash_uint64(mark);
+
+    CMAP_FOR_EACH_WITH_HASH (flow, direct_output_node,
+                             hash, &pmd->direct_output_table) {
+        if (flow->direct_output_mark == mark) {
+            break;
+        }
+    }
+
+    VLOG_DBG("Direct output lookup: "
+             "core_id(%d),in_port(%"PRIu32"),mark(0x%"PRIx64") -> %s.",
+             pmd->core_id, odp_to_u32(in_port), mark,
+             flow ? "success" : "fail");
+    return flow;
+}
+
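+/* Direct output may be used for 'in_port' only if every flow installed for
+ * that port is a direct output flow, i.e. the per-port flow counters in
+ * 'n_flows' and 'n_direct_flows' are equal. */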
+static bool
+dp_netdev_direct_output_enabled(const struct dp_netdev_pmd_thread *pmd,
+ odp_port_t in_port)
+{
+ return ccmap_find(&pmd->n_flows, odp_to_u32(in_port))
+ == ccmap_find(&pmd->n_direct_flows, odp_to_u32(in_port));
+}
+
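+/* Adds 'dp_flow' to 'pmd->direct_output_table', taking a reference that is
+ * dropped again when the flow is removed from the table. */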
+static void
+dp_netdev_direct_output_insert(struct dp_netdev_pmd_thread *pmd,
+ struct dp_netdev_flow *dp_flow)
+ OVS_REQUIRES(pmd->flow_mutex)
+{
+ uint32_t hash;
+ uint64_t mark;
+ uint8_t nw_frag = dp_flow->flow.nw_frag;
+ ovs_be16 dl_type = dp_flow->flow.dl_type;
+ odp_port_t in_port = dp_flow->flow.in_port.odp_port;
+
+ if (!dp_netdev_flow_ref(dp_flow)) {
+ return;
+ }
+
+ /* Avoid double insertion. Should not happen in practice. */
+ dp_netdev_direct_output_remove(pmd, dp_flow);
+
+ mark = dp_netdev_direct_output_mark(in_port, dl_type, nw_frag);
+ hash = hash_uint64(mark);
+
+ dp_flow->direct_output_mark = mark;
+ cmap_insert(&pmd->direct_output_table,
+ CONST_CAST(struct cmap_node *, &dp_flow->direct_output_node),
+ hash);
+ ccmap_inc(&pmd->n_direct_flows, odp_to_u32(in_port));
+
+    VLOG_DBG("Direct output insert: "
+             "core_id(%d),in_port(%"PRIu32"),mark(0x%"PRIx64").",
+             pmd->core_id, odp_to_u32(in_port), mark);
+}
+
+static void
+dp_netdev_direct_output_remove(struct dp_netdev_pmd_thread *pmd,
+ struct dp_netdev_flow *dp_flow)
+ OVS_REQUIRES(pmd->flow_mutex)
+{
+ uint32_t hash;
+ uint64_t mark;
+ struct dp_netdev_flow *flow;
+ uint8_t nw_frag = dp_flow->flow.nw_frag;
+ ovs_be16 dl_type = dp_flow->flow.dl_type;
+ odp_port_t in_port = dp_flow->flow.in_port.odp_port;
+
+ mark = dp_netdev_direct_output_mark(in_port, dl_type, nw_frag);
+ hash = hash_uint64(mark);
+
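+    /* 'dp_flow' may not be in the table at all, e.g. if its actions were
+     * never suitable for direct output, so look it up first and make the
+     * removal a no-op in that case. */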
+ flow = dp_netdev_direct_output_lookup(pmd, in_port, dl_type, nw_frag);
+ if (flow) {
+ ovs_assert(dp_flow == flow);
+        VLOG_DBG("Direct output remove: "
+                 "core_id(%d),in_port(%"PRIu32"),mark(0x%"PRIx64").",
+                 pmd->core_id, odp_to_u32(in_port), mark);
+ cmap_remove(&pmd->direct_output_table,
+ CONST_CAST(struct cmap_node *, &flow->direct_output_node),
+ hash);
+ ccmap_dec(&pmd->n_direct_flows, odp_to_u32(in_port));
+ dp_netdev_flow_unref(flow);
+ }
+}
+
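+/* Returns true if a flow with the given wildcards and actions may bypass
+ * the classifier, i.e. it either drops or outputs to a single port and
+ * matches on nothing beyond the fields that are known from the input port
+ * and a cheap parse of the headers. */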
+static bool
+dp_netdev_flow_is_direct_output(const struct flow_wildcards *wc,
+ const struct nlattr *actions,
+ size_t actions_len)
+{
+    /* Drop flows have no explicit actions; treat them as direct output. */
+ if (actions && actions_len) {
+ unsigned int left, n_actions = 0;
+ const struct nlattr *a;
+
+        /* Check that there is exactly one action and that it is an OUTPUT
+         * action. */
+ NL_ATTR_FOR_EACH (a, left, actions, actions_len) {
+ enum ovs_action_attr type = nl_attr_type(a);
+
+ if (++n_actions > 1 || type != OVS_ACTION_ATTR_OUTPUT) {
+ return false;
+ }
+ }
+ }
+
+    /* Check that the flow matches only the minimal set of fields that are
+     * always set by the datapath. */
+ if (wc) {
+ struct flow_wildcards *minimal = xmalloc(sizeof *minimal);
+
+ flow_wildcards_init_catchall(minimal);
+        /* 'dpif-netdev' always has the following in an exact match:
+         *  - recirc_id   <-- recirc_id == 0 is checked on input.
+         *  - in_port     <-- will be checked on input.
+         *  - packet_type <-- assuming all packets are PT_ETH.
+         *  - dl_type     <-- needs to be matched.
+         *  - vlan_tci    <-- no need to match if not requested.
+         *  - nw_frag     <-- needs to be matched for IP packets.
+         */
+ WC_MASK_FIELD(minimal, recirc_id);
+ WC_MASK_FIELD(minimal, in_port);
+ WC_MASK_FIELD(minimal, packet_type);
+ WC_MASK_FIELD(minimal, dl_type);
+ WC_MASK_FIELD(minimal, vlans[0].tci);
+ WC_MASK_FIELD_MASK(minimal, nw_frag, FLOW_NW_FRAG_MASK);
+
+ if (flow_wildcards_has_extra(minimal, wc)) {
+ free(minimal);
+ return false;
+ }
+ free(minimal);
+ }
+
+ return true;
+}
+
static struct dp_netdev_flow *
dp_netdev_flow_add(struct dp_netdev_pmd_thread *pmd,
struct match *match, const ovs_u128 *ufid,
- const struct nlattr *actions, size_t actions_len)
+ const struct nlattr *actions, size_t actions_len,
+ bool vlan_tci_wc_faked)
OVS_REQUIRES(pmd->flow_mutex)
{
struct ds extra_info = DS_EMPTY_INITIALIZER;
@@ -3691,6 +3878,14 @@ dp_netdev_flow_add(struct dp_netdev_pmd_thread *pmd,
cmap_insert(&pmd->flow_table, CONST_CAST(struct cmap_node *, &flow->node),
dp_netdev_flow_hash(&flow->ufid));
+ ccmap_inc(&pmd->n_flows, odp_to_u32(in_port));
+
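+    /* The flow may bypass the classifier only if its VLAN TCI match was
+     * faked by the caller, i.e. the flow does not really match on the VLAN
+     * tag, and if it can only be hit on initial reception of an Ethernet
+     * packet. */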
+ if (vlan_tci_wc_faked
+ && match->flow.recirc_id == 0
+ && match->flow.packet_type == htonl(PT_ETH)
+ && dp_netdev_flow_is_direct_output(&match->wc, actions, actions_len)) {
+ dp_netdev_direct_output_insert(pmd, flow);
+ }
queue_netdev_flow_put(pmd, flow, match, actions, actions_len);
@@ -3749,7 +3944,8 @@ flow_put_on_pmd(struct dp_netdev_pmd_thread *pmd,
struct match *match,
ovs_u128 *ufid,
const struct dpif_flow_put *put,
- struct dpif_flow_stats *stats)
+ struct dpif_flow_stats *stats,
+ bool vlan_tci_wc_faked)
{
struct dp_netdev_flow *netdev_flow;
int error = 0;
@@ -3763,7 +3959,7 @@ flow_put_on_pmd(struct dp_netdev_pmd_thread *pmd,
if (!netdev_flow) {
if (put->flags & DPIF_FP_CREATE) {
dp_netdev_flow_add(pmd, match, ufid, put->actions,
- put->actions_len);
+ put->actions_len, vlan_tci_wc_faked);
} else {
error = ENOENT;
}
@@ -3778,6 +3974,12 @@ flow_put_on_pmd(struct dp_netdev_pmd_thread *pmd,
old_actions = dp_netdev_flow_get_actions(netdev_flow);
ovsrcu_set(&netdev_flow->actions, new_actions);
+ if (!dp_netdev_flow_is_direct_output(NULL, new_actions->actions,
+ new_actions->size)) {
+        /* The new actions are not suitable for direct output; stop
+         * bypassing the classifier for this flow. */
+ dp_netdev_direct_output_remove(pmd, netdev_flow);
+ }
+
queue_netdev_flow_put(pmd, netdev_flow, match,
put->actions, put->actions_len);
@@ -3819,6 +4021,7 @@ dpif_netdev_flow_put(struct dpif *dpif, const struct dpif_flow_put *put)
ovs_u128 ufid;
int error;
bool probe = put->flags & DPIF_FP_PROBE;
+ bool vlan_tci_wc_faked = false;
if (put->stats) {
memset(put->stats, 0, sizeof *put->stats);
@@ -3857,6 +4060,7 @@ dpif_netdev_flow_put(struct dpif *dpif, const struct dpif_flow_put *put)
* Netlink and struct flow representations, we have to do the same
* here. This must be in sync with 'match' in handle_packet_upcall(). */
if (!match.wc.masks.vlans[0].tci) {
+ vlan_tci_wc_faked = true;
match.wc.masks.vlans[0].tci = htons(0xffff);
}
@@ -3875,7 +4079,7 @@ dpif_netdev_flow_put(struct dpif *dpif, const struct dpif_flow_put *put)
int pmd_error;
pmd_error = flow_put_on_pmd(pmd, &key, &match, &ufid, put,
- &pmd_stats);
+ &pmd_stats, vlan_tci_wc_faked);
if (pmd_error) {
error = pmd_error;
} else if (put->stats) {
@@ -3890,7 +4094,8 @@ dpif_netdev_flow_put(struct dpif *dpif, const struct dpif_flow_put *put)
if (!pmd) {
return EINVAL;
}
- error = flow_put_on_pmd(pmd, &key, &match, &ufid, put, put->stats);
+ error = flow_put_on_pmd(pmd, &key, &match, &ufid, put, put->stats,
+ vlan_tci_wc_faked);
dp_netdev_pmd_unref(pmd);
}
@@ -6552,6 +6757,9 @@ dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, struct dp_netdev *dp,
ovs_mutex_init(&pmd->bond_mutex);
cmap_init(&pmd->flow_table);
cmap_init(&pmd->classifiers);
+ cmap_init(&pmd->direct_output_table);
+ ccmap_init(&pmd->n_flows);
+ ccmap_init(&pmd->n_direct_flows);
pmd->ctx.last_rxq = NULL;
pmd_thread_ctx_time_update(pmd);
pmd->next_optimization = pmd->ctx.now + DPCLS_OPTIMIZATION_INTERVAL;
@@ -6591,6 +6799,9 @@ dp_netdev_destroy_pmd(struct dp_netdev_pmd_thread *pmd)
}
cmap_destroy(&pmd->classifiers);
cmap_destroy(&pmd->flow_table);
+ cmap_destroy(&pmd->direct_output_table);
+ ccmap_destroy(&pmd->n_flows);
+ ccmap_destroy(&pmd->n_direct_flows);
ovs_mutex_destroy(&pmd->flow_mutex);
seq_destroy(pmd->reload_seq);
ovs_mutex_destroy(&pmd->port_mutex);
@@ -7099,6 +7310,7 @@ dfc_processing(struct dp_netdev_pmd_thread *pmd,
bool smc_enable_db;
size_t map_cnt = 0;
bool batch_enable = true;
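+    /* True if every flow installed for 'port_no' can be found by input
+     * port, Ethernet type and fragment bits alone. */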
+ bool direct_output_enabled = dp_netdev_direct_output_enabled(pmd, port_no);
atomic_read_relaxed(&pmd->dp->smc_enable_db, &smc_enable_db);
pmd_perf_update_counter(&pmd->perf_stats,
@@ -7106,7 +7318,7 @@ dfc_processing(struct dp_netdev_pmd_thread *pmd,
cnt);
DP_PACKET_BATCH_REFILL_FOR_EACH (i, cnt, packet, packets_) {
- struct dp_netdev_flow *flow;
+ struct dp_netdev_flow *flow = NULL;
uint32_t mark;
if (OVS_UNLIKELY(dp_packet_size(packet) < ETH_HEADER_LEN)) {
@@ -7124,13 +7336,27 @@ dfc_processing(struct dp_netdev_pmd_thread *pmd,
if (!md_is_valid) {
pkt_metadata_init(&packet->md, port_no);
- }
- if ((*recirc_depth_get() == 0) &&
- dp_packet_has_flow_mark(packet, &mark)) {
- flow = mark_to_flow_find(pmd, mark);
- if (OVS_LIKELY(flow)) {
- tcp_flags = parse_tcp_flags(packet);
+ if (dp_packet_has_flow_mark(packet, &mark)) {
+ flow = mark_to_flow_find(pmd, mark);
+ if (OVS_LIKELY(flow)) {
+ tcp_flags = parse_tcp_flags(packet, NULL, NULL);
+ }
+ }
+
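+            /* Direct output fast path: parse just enough of the headers to
+             * build a direct output mark and look the flow up by it. */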
+            if (!flow && direct_output_enabled) {
+                ovs_be16 dl_type = 0;
+                uint8_t nw_frag = 0;
+
+                tcp_flags = parse_tcp_flags(packet, &dl_type, &nw_frag);
+                flow = dp_netdev_direct_output_lookup(pmd, port_no,
+                                                      dl_type, nw_frag);
+                if (flow) {
+                    COVERAGE_INC(datapath_direct_output_packet);
+                }
+            }
+        }
+
+ if (flow) {
if (OVS_LIKELY(batch_enable)) {
dp_netdev_queue_batches(packet, flow, tcp_flags, batches,
n_batches);
@@ -7218,6 +7444,7 @@ handle_packet_upcall(struct dp_netdev_pmd_thread *pmd,
ovs_u128 ufid;
int error;
uint64_t cycles = cycles_counter_update(&pmd->perf_stats);
+ bool vlan_tci_wc_faked = false;
match.tun_md.valid = false;
miniflow_expand(&key->mf, &match.flow);
@@ -7244,6 +7471,7 @@ handle_packet_upcall(struct dp_netdev_pmd_thread *pmd,
* here. This must be in sync with 'match' in dpif_netdev_flow_put(). */
if (!match.wc.masks.vlans[0].tci) {
match.wc.masks.vlans[0].tci = htons(0xffff);
+ vlan_tci_wc_faked = true;
}
/* We can't allow the packet batching in the next loop to execute
@@ -7267,7 +7495,8 @@ handle_packet_upcall(struct dp_netdev_pmd_thread *pmd,
if (OVS_LIKELY(!netdev_flow)) {
netdev_flow = dp_netdev_flow_add(pmd, &match, &ufid,
add_actions->data,
- add_actions->size);
+ add_actions->size,
+ vlan_tci_wc_faked);
}
ovs_mutex_unlock(&pmd->flow_mutex);
uint32_t hash = dp_netdev_flow_hash(&netdev_flow->ufid);
@@ -1085,11 +1085,14 @@ parse_dl_type(const void **datap, size_t *sizep)
/* Parses and return the TCP flags in 'packet', converted to host byte order.
* If 'packet' is not an Ethernet packet embedding TCP, returns 0.
+ * If nonnull, '*dl_type_p' is set to the Ethernet type of 'packet'.
+ * If nonnull, '*nw_frag_p' is set only if 'packet' is an IP packet.
*
* The caller must ensure that 'packet' is at least ETH_HEADER_LEN bytes
* long.'*/
uint16_t
-parse_tcp_flags(struct dp_packet *packet)
+parse_tcp_flags(struct dp_packet *packet,
+ ovs_be16 *dl_type_p, uint8_t *nw_frag_p)
{
const void *data = dp_packet_data(packet);
const char *frame = (const char *)data;
@@ -1104,6 +1107,9 @@ parse_tcp_flags(struct dp_packet *packet)
dp_packet_reset_offsets(packet);
dl_type = parse_dl_type(&data, &size);
+ if (dl_type_p) {
+ *dl_type_p = dl_type;
+ }
if (OVS_UNLIKELY(eth_type_mpls(dl_type))) {
packet->l2_5_ofs = (char *)data - frame;
}
@@ -1144,6 +1150,10 @@ parse_tcp_flags(struct dp_packet *packet)
return 0;
}
+ if (nw_frag_p) {
+ *nw_frag_p = nw_frag;
+ }
+
packet->l4_ofs = (uint16_t)((char *)data - frame);
if (!(nw_frag & FLOW_NW_FRAG_LATER) && nw_proto == IPPROTO_TCP &&
size >= TCP_HEADER_LEN) {
@@ -134,8 +134,8 @@ bool parse_ipv6_ext_hdrs(const void **datap, size_t *sizep, uint8_t *nw_proto,
uint8_t *nw_frag,
const struct ovs_16aligned_ip6_frag **frag_hdr);
bool parse_nsh(const void **datap, size_t *sizep, struct ovs_key_nsh *key);
-uint16_t parse_tcp_flags(struct dp_packet *packet);
+uint16_t parse_tcp_flags(struct dp_packet *packet, ovs_be16 *dl_type_p,
+                         uint8_t *nw_frag_p);
 
static inline uint64_t
flow_get_xreg(const struct flow *flow, int idx)
{