
[LEDE-DEV] Concern over kernel: backport patches improving fq_codel drop behavior

Message ID 5735A51E.1010203@darbyshire-bryant.me.uk
State RFC

Commit Message

Kevin Darbyshire-Bryant May 13, 2016, 9:57 a.m. UTC
Hi All,

I've just seen
https://github.com/lede-project/source/commit/fad8bdfa40df8636a52d770bbab010086c1976ec
go through and wish to raise a concern:

The patch introduces a kernel API change: 'qdisc_tree_reduce_backlog'
replaces 'qdisc_tree_decrease_qlen' without a corresponding kernel
version number change.  This will cause kmod-sched-cake to fail to build
on LEDE 4.4 kernels.  kmod-sched-cake could be patched to match; however,
that would make the package incompatible with OpenWrt until it too
carries the above kernel patch (the build would then fail the 'other
way around').

Personally, I think the patch is too invasive until an official backport
is done (with a corresponding kernel revision bump that kmod-sched-cake
can test for).
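
To illustrate the sort of guard I mean (an untested sketch, and it
assumes the renamed helper lands in a tagged release - presumably 4.6
going by upstream):

	#include <linux/version.h>

	/* On kernels predating the API change, map the new helper onto
	 * the old one.  The byte backlog argument is simply dropped,
	 * as the old API never tracked it.
	 */
	#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 6, 0)
	#define qdisc_tree_reduce_backlog(sch, n, len) \
		qdisc_tree_decrease_qlen(sch, n)
	#endif

The trouble is that LINUX_VERSION_CODE can't see a patch applied
out-of-band to a 4.4 tree, which is exactly why the missing version
bump bites.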

I'm *very* much *for* the fq_codel batch drop functionality, and ideally
that should be in a LEDE 4.4 patch without the API change.  I attach a
patch that I think is more suitable... or that would at least form the
basis of something more suitable.
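
Once either version is in place, the new knob is exposed via netlink;
with an iproute2 that carries the matching patch (I'm taking the option
name from that patch, so treat it as an assumption) tuning would look
like:

	# drop up to 64 packets from the fattest flow per overlimit event
	tc qdisc replace dev eth0 root fq_codel drop_batch 64

To put rough numbers on the batching: if the fat flow holds 400 packets
of 1500 bytes (600kB of backlog), the drop threshold is half that, but
the 64 packet cap wins, so one overlimit event frees 64 packets (96kB)
rather than just one.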

Apologies for the somewhat rushed job on this... I'm 10 minutes from
rushing out of the door for a week's holiday :-)

Kevin

Comments

Felix Fietkau May 16, 2016, 10:46 a.m. UTC | #1
On 2016-05-13 11:57, Kevin Darbyshire-Bryant wrote:
> Hi All,
> 
> I've just seen
> https://github.com/lede-project/source/commit/fad8bdfa40df8636a52d770bbab010086c1976ec
> go through and wish to raise a concern:
> 
> The patch introduces a kernel API change: 'qdisc_tree_reduce_backlog'
> replaces 'qdisc_tree_decrease_qlen' without a corresponding kernel
> version number change.  This will cause kmod-sched-cake to fail to build
> on LEDE 4.4 kernels.  kmod-sched-cake could be patched to match; however,
> that would make the package incompatible with OpenWrt until it too
> carries the above kernel patch (the build would then fail the 'other
> way around').
> 
> Personally, I think the patch is too invasive until an official backport
> is done (with a corresponding kernel revision bump that kmod-sched-cake
> can test for).
> 
> I'm *very* much *for* the fq_codel batch drop functionality, and ideally
> that should be in a LEDE 4.4 patch without the API change.  I attach a
> patch that I think is more suitable... or that would at least form the
> basis of something more suitable.
> 
> Apologies for the somewhat rushed job on this... I'm 10 minutes from
> rushing out of the door for a week's holiday :-)
What do you think about moving cake to the LEDE source tree instead?

- Felix
Kevin Darbyshire-Bryant May 16, 2016, 6:49 p.m. UTC | #2
On 16/05/16 11:46, Felix Fietkau wrote:
> On 2016-05-13 11:57, Kevin Darbyshire-Bryant wrote:
>> Hi All,
<snip>
>>
>> I'm *very* much *for* the fq_codel batch drop functionality, and ideally
>> that should be in a LEDE 4.4 patch without the API change.  I attach a
>> patch that I think is more suitable... or that would at least form the
>> basis of something more suitable.
>>
>> Apologies for the somewhat rushed job on this... I'm 10 minutes from
>> rushing out of the door for a week's holiday :-)
> What do you think about moving cake to the LEDE source tree instead?
>
> - Felix
>

That's certainly a plan!  I'll do a PR next week :-)

Kevin

Patch

diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index 1c78c7454c7c..a11afecd4482 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -718,6 +718,7 @@ enum {
 	TCA_FQ_CODEL_FLOWS,
 	TCA_FQ_CODEL_QUANTUM,
 	TCA_FQ_CODEL_CE_THRESHOLD,
+	TCA_FQ_CODEL_DROP_BATCH_SIZE,
 	__TCA_FQ_CODEL_MAX
 };
 
diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c
index a5e420b3d4ab..e7b42b0d5145 100644
--- a/net/sched/sch_fq_codel.c
+++ b/net/sched/sch_fq_codel.c
@@ -59,6 +59,7 @@ struct fq_codel_sched_data {
 	u32		flows_cnt;	/* number of flows */
 	u32		perturbation;	/* hash perturbation */
 	u32		quantum;	/* psched_mtu(qdisc_dev(sch)); */
+	u32		drop_batch_size;
 	struct codel_params cparams;
 	struct codel_stats cstats;
 	u32		drop_overlimit;
@@ -135,17 +136,20 @@ static inline void flow_queue_add(struct fq_codel_flow *flow,
 	skb->next = NULL;
 }
 
-static unsigned int fq_codel_drop(struct Qdisc *sch)
+static unsigned int fq_codel_drop(struct Qdisc *sch, unsigned int max_packets)
 {
 	struct fq_codel_sched_data *q = qdisc_priv(sch);
 	struct sk_buff *skb;
 	unsigned int maxbacklog = 0, idx = 0, i, len;
 	struct fq_codel_flow *flow;
+	unsigned int threshold;
 
-	/* Queue is full! Find the fat flow and drop packet from it.
+	/* Queue is full! Find the fat flow and drop packet(s) from it.
 	 * This might sound expensive, but with 1024 flows, we scan
 	 * 4KB of memory, and we dont need to handle a complex tree
 	 * in fast path (packet queue/enqueue) with many cache misses.
+	 * In stress mode, we'll try to drop 64 packets from the flow,
+	 * amortizing this linear lookup to one cache line per drop.
 	 */
 	for (i = 0; i < q->flows_cnt; i++) {
 		if (q->backlogs[i] > maxbacklog) {
@@ -153,15 +157,24 @@ static unsigned int fq_codel_drop(struct Qdisc *sch)
 			idx = i;
 		}
 	}
+
+	/* Our goal is to drop half of this fat flow backlog */
+	threshold = maxbacklog >> 1;
+
 	flow = &q->flows[idx];
-	skb = dequeue_head(flow);
-	len = qdisc_pkt_len(skb);
+	len = 0;
+	i = 0;
+	do {
+		skb = dequeue_head(flow);
+		len += qdisc_pkt_len(skb);
+		kfree_skb(skb);
+	} while (++i < max_packets && len < threshold);
+
+	flow->dropped += i;
 	q->backlogs[idx] -= len;
-	sch->q.qlen--;
-	qdisc_qstats_drop(sch);
-	qdisc_qstats_backlog_dec(sch, skb);
-	kfree_skb(skb);
-	flow->dropped++;
+	sch->qstats.drops += i;
+	sch->qstats.backlog -= len;
+	sch->q.qlen -= i;
 	return idx;
 }
 
@@ -170,14 +183,14 @@ static unsigned int fq_codel_qdisc_drop(struct Qdisc *sch)
 	unsigned int prev_backlog;
 
 	prev_backlog = sch->qstats.backlog;
-	fq_codel_drop(sch);
+	fq_codel_drop(sch, 1U);
 	return prev_backlog - sch->qstats.backlog;
 }
 
 static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 {
 	struct fq_codel_sched_data *q = qdisc_priv(sch);
-	unsigned int idx;
+	unsigned int idx, prev_qlen;
 	struct fq_codel_flow *flow;
 	int uninitialized_var(ret);
 
@@ -206,16 +219,21 @@ static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 	if (++sch->q.qlen <= sch->limit)
 		return NET_XMIT_SUCCESS;
 
-	q->drop_overlimit++;
-	/* Return Congestion Notification only if we dropped a packet
-	 * from this flow.
+	prev_qlen = sch->q.qlen;
+
+	/* fq_codel_drop() is quite expensive, as it performs a linear search
+	 * in q->backlogs[] to find a fat flow.
+	 * So instead of dropping a single packet, drop half of its backlog
+	 * with a 64 packets limit to not add a too big cpu spike here.
 	 */
-	if (fq_codel_drop(sch) == idx)
-		return NET_XMIT_CN;
+	ret = fq_codel_drop(sch, q->drop_batch_size);
+
+	q->drop_overlimit += prev_qlen - sch->q.qlen;
 
-	/* As we dropped a packet, better let upper stack know this */
-	qdisc_tree_decrease_qlen(sch, 1);
-	return NET_XMIT_SUCCESS;
+	/* As we dropped packet(s), better let upper stack know this */
+	qdisc_tree_decrease_qlen(sch, prev_qlen - sch->q.qlen);
+
+	return ret == idx ? NET_XMIT_CN : NET_XMIT_SUCCESS;
 }
 
 /* This is the specific function called from codel_dequeue()
@@ -335,6 +353,7 @@ static const struct nla_policy fq_codel_policy[TCA_FQ_CODEL_MAX + 1] = {
 	[TCA_FQ_CODEL_FLOWS]	= { .type = NLA_U32 },
 	[TCA_FQ_CODEL_QUANTUM]	= { .type = NLA_U32 },
 	[TCA_FQ_CODEL_CE_THRESHOLD] = { .type = NLA_U32 },
+	[TCA_FQ_CODEL_DROP_BATCH_SIZE] = { .type = NLA_U32 },
 };
 
 static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt)
@@ -386,6 +405,9 @@ static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt)
 	if (tb[TCA_FQ_CODEL_QUANTUM])
 		q->quantum = max(256U, nla_get_u32(tb[TCA_FQ_CODEL_QUANTUM]));
 
+	if (tb[TCA_FQ_CODEL_DROP_BATCH_SIZE])
+		q->drop_batch_size = max(1U, nla_get_u32(tb[TCA_FQ_CODEL_DROP_BATCH_SIZE]));
+
 	while (sch->q.qlen > sch->limit) {
 		struct sk_buff *skb = fq_codel_dequeue(sch);
 
@@ -431,6 +453,7 @@ static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt)
 
 	sch->limit = 10*1024;
 	q->flows_cnt = 1024;
+	q->drop_batch_size = 64;
 	q->quantum = psched_mtu(qdisc_dev(sch));
 	q->perturbation = prandom_u32();
 	INIT_LIST_HEAD(&q->new_flows);
@@ -489,6 +512,8 @@ static int fq_codel_dump(struct Qdisc *sch, struct sk_buff *skb)
 			q->cparams.ecn) ||
 	    nla_put_u32(skb, TCA_FQ_CODEL_QUANTUM,
 			q->quantum) ||
+	    nla_put_u32(skb, TCA_FQ_CODEL_DROP_BATCH_SIZE,
+			q->drop_batch_size) ||
 	    nla_put_u32(skb, TCA_FQ_CODEL_FLOWS,
 			q->flows_cnt))
 		goto nla_put_failure;