@@ -88,6 +88,9 @@ VLOG_DEFINE_THIS_MODULE(dpif_netdev);
#define MAX_RECIRC_DEPTH 6
DEFINE_STATIC_PER_THREAD_DATA(uint32_t, recirc_depth, 0)
+/* Use instant packet send by default. */
+#define DEFAULT_TX_FLUSH_INTERVAL 0
+
/* Configuration parameters. */
enum { MAX_FLOWS = 65536 }; /* Maximum number of flows in flow table. */
enum { MAX_METERS = 65536 }; /* Maximum number of meters. */
@@ -274,6 +277,9 @@ struct dp_netdev {
struct hmap ports;
struct seq *port_seq; /* Incremented whenever a port changes. */
+ /* The time, in microseconds, that a packet can wait in an output
+ * batch before sending. */
+ atomic_uint32_t tx_flush_interval;
+
/* Meters. */
struct ovs_mutex meter_locks[N_METER_LOCKS];
struct dp_meter *meters[MAX_METERS]; /* Meter bands. */
@@ -503,6 +509,7 @@ struct tx_port {
int qid;
long long last_used;
struct hmap_node node;
+ long long flush_time;
struct dp_packet_batch output_pkts;
struct dp_netdev_rxq *output_pkts_rxqs[NETDEV_MAX_BURST];
};
@@ -588,6 +595,9 @@ struct dp_netdev_pmd_thread {
* than 'cmap_count(dp->poll_threads)'. */
uint32_t static_tx_qid;
+ /* Number of filled (non-empty) output batches. */
+ int n_output_batches;
+
struct ovs_mutex port_mutex; /* Mutex for 'poll_list' and 'tx_ports'. */
/* List of rx queues to poll. */
struct hmap poll_list OVS_GUARDED;
@@ -677,8 +687,9 @@ static void dp_netdev_add_rxq_to_pmd(struct dp_netdev_pmd_thread *pmd,
static void dp_netdev_del_rxq_from_pmd(struct dp_netdev_pmd_thread *pmd,
struct rxq_poll *poll)
OVS_REQUIRES(pmd->port_mutex);
-static void
-dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread *pmd);
+static int
+dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread *pmd,
+ bool force);
static void reconfigure_datapath(struct dp_netdev *dp)
OVS_REQUIRES(dp->port_mutex);
@@ -1344,6 +1355,7 @@ create_dp_netdev(const char *name, const struct dpif_class *class,
conntrack_init(&dp->conntrack);
atomic_init(&dp->emc_insert_min, DEFAULT_EM_FLOW_INSERT_MIN);
+ atomic_init(&dp->tx_flush_interval, DEFAULT_TX_FLUSH_INTERVAL);
cmap_init(&dp->poll_threads);
@@ -3010,7 +3022,7 @@ dpif_netdev_execute(struct dpif *dpif, struct dpif_execute *execute)
dp_packet_batch_init_packet(&pp, execute->packet);
dp_netdev_execute_actions(pmd, &pp, false, execute->flow,
execute->actions, execute->actions_len);
- dp_netdev_pmd_flush_output_packets(pmd);
+ dp_netdev_pmd_flush_output_packets(pmd, true);
if (pmd->core_id == NON_PMD_CORE_ID) {
ovs_mutex_unlock(&dp->non_pmd_mutex);
@@ -3059,6 +3071,16 @@ dpif_netdev_set_config(struct dpif *dpif, const struct smap *other_config)
smap_get_ullong(other_config, "emc-insert-inv-prob",
DEFAULT_EM_FLOW_INSERT_INV_PROB);
uint32_t insert_min, cur_min;
+ uint32_t tx_flush_interval, cur_tx_flush_interval;
+
+ tx_flush_interval = smap_get_int(other_config, "tx-flush-interval",
+ DEFAULT_TX_FLUSH_INTERVAL);
+ atomic_read_relaxed(&dp->tx_flush_interval, &cur_tx_flush_interval);
+ if (tx_flush_interval != cur_tx_flush_interval) {
+ atomic_store_relaxed(&dp->tx_flush_interval, tx_flush_interval);
+ VLOG_INFO("Flushing interval for tx queues set to %"PRIu32" us",
+ tx_flush_interval);
+ }
if (!nullable_string_is_equal(dp->pmd_cmask, cmask)) {
free(dp->pmd_cmask);
@@ -3328,13 +3350,14 @@ dp_netdev_rxq_get_intrvl_cycles(struct dp_netdev_rxq *rx, unsigned idx)
return processing_cycles;
}
-static void
+static int
dp_netdev_pmd_flush_output_on_port(struct dp_netdev_pmd_thread *pmd,
struct tx_port *p)
{
int tx_qid;
int output_cnt;
bool dynamic_txqs;
+ uint32_t tx_flush_interval;
enum pmd_cycles_counter_type save_pmd_cycles_type;
/* In case we're in PMD_CYCLES_PROCESSING state we need to count
@@ -3354,10 +3377,18 @@ dp_netdev_pmd_flush_output_on_port(struct dp_netdev_pmd_thread *pmd,
}
output_cnt = dp_packet_batch_size(&p->output_pkts);
+ ovs_assert(output_cnt > 0);
netdev_send(p->port->netdev, tx_qid, &p->output_pkts, dynamic_txqs);
dp_packet_batch_init(&p->output_pkts);
+ /* Update the time of the next flush for this port. */
+ atomic_read_relaxed(&pmd->dp->tx_flush_interval, &tx_flush_interval);
+ p->flush_time = pmd->ctx.now + tx_flush_interval;
+
+ ovs_assert(pmd->n_output_batches > 0);
+ pmd->n_output_batches--;
+
pmd_perf_update_counter(&pmd->perf_stats,
PMD_STAT_SENT_PKTS, output_cnt);
pmd_perf_update_counter(&pmd->perf_stats,
@@ -3366,29 +3397,39 @@ dp_netdev_pmd_flush_output_on_port(struct dp_netdev_pmd_thread *pmd,
/* Update send cycles for all the rx queues and restore previous state. */
cycles_count_intermediate(pmd, p->output_pkts_rxqs, output_cnt);
pmd->ctx.current_pmd_cycles_type = save_pmd_cycles_type;
+ return output_cnt;
}
-static void
-dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread *pmd)
+static int
+dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread *pmd,
+ bool force)
{
struct tx_port *p;
+ int output_cnt = 0;
+
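+ /* Fast path: no tx port has queued packets, so there is nothing
+ * to flush. */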
+ if (!pmd->n_output_batches) {
+ return 0;
+ }
HMAP_FOR_EACH (p, node, &pmd->send_port_cache) {
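+ /* Send a port's batch if a flush is forced or if the batch's
+ * flush deadline has passed. */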
- if (!dp_packet_batch_is_empty(&p->output_pkts)) {
- dp_netdev_pmd_flush_output_on_port(pmd, p);
+ if (!dp_packet_batch_is_empty(&p->output_pkts)
+ && (force || pmd->ctx.now >= p->flush_time)) {
+ output_cnt += dp_netdev_pmd_flush_output_on_port(pmd, p);
}
}
+ return output_cnt;
}
static int
dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
struct dp_netdev_rxq *rxq,
- odp_port_t port_no)
+ odp_port_t port_no,
+ bool *flushed)
{
struct pmd_perf_stats *s = &pmd->perf_stats;
struct dp_packet_batch batch;
int error;
- int batch_cnt = 0;
+ int batch_cnt = 0, output_cnt = 0;
dp_packet_batch_init(&batch);
@@ -3422,7 +3463,8 @@ dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
dp_netdev_input(pmd, &batch, port_no);
cycles_count_intermediate(pmd, &rxq, 1);
- dp_netdev_pmd_flush_output_packets(pmd);
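+ /* Flush output batches whose deadline has passed and remember
+ * that a flush attempt was already made on this iteration. */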
+ output_cnt = dp_netdev_pmd_flush_output_packets(pmd, false);
+ *flushed = true;
} else if (error != EAGAIN && error != EOPNOTSUPP) {
static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
@@ -4057,6 +4099,7 @@ dpif_netdev_run(struct dpif *dpif)
struct dp_netdev *dp = get_dp_netdev(dpif);
struct dp_netdev_pmd_thread *non_pmd;
uint64_t new_tnl_seq;
+ bool flushed = false;
ovs_mutex_lock(&dp->port_mutex);
non_pmd = dp_netdev_get_pmd(dp, NON_PMD_CORE_ID);
@@ -4070,12 +4113,20 @@ dpif_netdev_run(struct dpif *dpif)
for (i = 0; i < port->n_rxq; i++) {
dp_netdev_process_rxq_port(non_pmd,
&port->rxqs[i],
- port->port_no);
+ port->port_no,
+ &flushed);
}
}
}
+ if (!flushed) {
+ /* We didn't receive anything in the process loop, so the thread's
+ * time was not updated on this iteration. Update it now and check
+ * whether any queued packets need to be sent. */
+ pmd_thread_ctx_time_update(non_pmd);
+ dp_netdev_pmd_flush_output_packets(non_pmd, false);
+ }
+
cycles_count_end(non_pmd);
- pmd_thread_ctx_time_update(non_pmd);
dpif_netdev_xps_revalidate_pmd(non_pmd, false);
ovs_mutex_unlock(&dp->non_pmd_mutex);
@@ -4126,6 +4177,8 @@ pmd_free_cached_ports(struct dp_netdev_pmd_thread *pmd)
{
struct tx_port *tx_port_cached;
+ /* Flush all the queued packets. */
+ dp_netdev_pmd_flush_output_packets(pmd, true);
/* Free all used tx queue ids. */
dpif_netdev_xps_revalidate_pmd(pmd, true);
@@ -4255,25 +4308,32 @@ reload:
cycles_count_start(pmd);
for (;;) {
uint64_t iter_packets = 0;
+ bool flushed = false;
pmd_perf_start_iteration(s, pmd->ctx.last_cycles);
for (i = 0; i < poll_cnt; i++) {
int rxq_packets =
dp_netdev_process_rxq_port(pmd, poll_list[i].rxq,
- poll_list[i].port_no);
+ poll_list[i].port_no,
+ &flushed);
iter_packets += rxq_packets;
}
+ if (!flushed) {
+ /* We didn't receive anything in the process loop, so the thread's
+ * time was not updated on this iteration. Update it now and check
+ * whether any queued packets need to be sent. */
+ pmd_thread_ctx_time_update(pmd);
+ dp_netdev_pmd_flush_output_packets(pmd, false);
+ }
+
if (lc++ > 1024) {
bool reload;
lc = 0;
coverage_try_clear();
- /* It's possible that the time was not updated on current
- * iteration, if there were no received packets. */
- pmd_thread_ctx_time_update(pmd);
dp_netdev_pmd_try_optimize(pmd, poll_list, poll_cnt);
if (!ovsrcu_try_quiesce()) {
emc_cache_slow_sweep(&pmd->flow_cache);
@@ -4717,6 +4777,7 @@ dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, struct dp_netdev *dp,
pmd->core_id = core_id;
pmd->numa_id = numa_id;
pmd->need_reload = false;
+ pmd->n_output_batches = 0;
ovs_refcount_init(&pmd->ref_cnt);
latch_init(&pmd->exit_latch);
@@ -4907,6 +4968,7 @@ dp_netdev_add_port_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
tx->port = port;
tx->qid = -1;
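+ /* A zero flush time lets the first batch queued on this port be
+ * flushed immediately. */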
+ tx->flush_time = 0LL;
dp_packet_batch_init(&tx->output_pkts);
hmap_insert(&pmd->tx_ports, &tx->node, hash_port_no(tx->port->port_no));
@@ -5610,12 +5672,16 @@ dp_execute_cb(void *aux_, struct dp_packet_batch *packets_,
dp_netdev_pmd_flush_output_on_port(pmd, p);
}
#endif
- if (OVS_UNLIKELY(dp_packet_batch_size(&p->output_pkts)
- + dp_packet_batch_size(packets_) > NETDEV_MAX_BURST)) {
- /* Some packets was generated while input batch processing.
- * Flush here to avoid overflow. */
+ if (dp_packet_batch_size(&p->output_pkts)
+ + dp_packet_batch_size(packets_) > NETDEV_MAX_BURST) {
+ /* Flush here to avoid overflow. */
dp_netdev_pmd_flush_output_on_port(pmd, p);
}
+
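+ /* The batch is empty, so the packets added below will make it a
+ * newly filled one; account for it. */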
+ if (dp_packet_batch_is_empty(&p->output_pkts)) {
+ pmd->n_output_batches++;
+ }
+
DP_PACKET_BATCH_FOR_EACH (packet, packets_) {
p->output_pkts_rxqs[dp_packet_batch_size(&p->output_pkts)] =
pmd->ctx.last_rxq;
@@ -359,6 +359,22 @@
</p>
</column>
+ <column name="other_config" key="tx-flush-interval"
+ type='{"type": "integer",
+ "minInteger": 0, "maxInteger": 1000000}'>
+ <p>
+ Specifies the time, in microseconds, that a packet can wait in an
+ output batch before being sent, i.e. the amount of time that a
+ packet can spend in an intermediate output queue before it is
+ sent to the netdev. This option can be used to balance throughput
+ against latency: lower values decrease latency, while higher
+ values may help to achieve higher throughput.
+ </p>
+ <p>
+ Defaults to 0, i.e. instant packet sending (latency optimized).
+ </p>
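+ <p>
+ For example, assuming the default database, the following command
+ sets the maximum batching delay to 50 microseconds:
+ <code>ovs-vsctl set Open_vSwitch . other_config:tx-flush-interval=50</code>
+ </p>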
+ </column>
+
<column name="other_config" key="n-handler-threads"
type='{"type": "integer", "minInteger": 1}'>
<p>