diff mbox series

[RFC,net-next,2/2] net: sched: fq_pie: Flow Queue PIE AQM

Message ID 20190331174005.5841-3-gautamramk@gmail.com
State RFC
Delegated to: David Miller
Headers show
Series net: sched: add Flow Queue PIE AQM | expand

Commit Message

Gautam Ramakrishnan March 31, 2019, 5:40 p.m. UTC
From: Mohit P. Tahiliani <tahiliani@nitk.edu.in>

FQ-PIE incorporates fair/flow queuing in which every queue
is managed by an instance of PIE queueing discipline.
The algorithm provides good control over the queueing delay
while, at the same time, ensuring fairness across various
flows sharing the same link.

Principles:
  - Packets are classified stochastically on flows.
  - Each flow has a PIE managed queue.
  - Flows are linked onto two (Round Robin) lists,
    so that new flows have priority over old ones.
  - For a given flow, packets are dropped on arrival according
    to a drop probability.
  - Tail drops only.

Usage:
tc qdisc ... fq_pie [ limit PACKETS ] [ flows NUMBER ]
                    [ alpha NUMBER ] [ beta NUMBER ]
                    [ target TIME us ] [ tupdate TIME us ]
		    [ bytemode ] [ quantum BYTES ]
		    [ ecn | ecn_prob PERCENTAGE ]

defaults: 1024 flows, 10240 packets limit, quantum : device MTU
           target: 15ms
           tupdate: 15ms
           alpha: 2 (on a scale of 0 to 16)
           beta: 20 (on a scale of 0 to 16)
           ecn: false
           ecn_prob: 10%

Signed-off-by: Mohit P. Tahiliani <tahiliani@nitk.edu.in>
Signed-off-by: Sachin D. Patil <sdp.sachin@gmail.com>
Signed-off-by: Mohit Bhasi <mohitbhasi1998@gmail.com>
Signed-off-by: V. Saicharan <vsaicharan1998@gmail.com>
Signed-off-by: Leslie Monis <lesliemonis@gmail.com>
Signed-off-by: Gautam Ramakrishnan <gautamramk@gmail.com>
Cc: Dave Taht <dave.taht@gmail.com>
---
 include/uapi/linux/pkt_sched.h |  28 ++
 net/sched/Kconfig              |  14 +-
 net/sched/Makefile             |   1 +
 net/sched/sch_fq_pie.c         | 485 +++++++++++++++++++++++++++++++++
 4 files changed, 527 insertions(+), 1 deletion(-)
 create mode 100644 net/sched/sch_fq_pie.c

Comments

Toke Høiland-Jørgensen April 2, 2019, 10:49 a.m. UTC | #1
Some suggestions below to make fq_pie and fq_codel more similar (ref. my
previous email).

Also, a few unrelated nits.

> From: Mohit P. Tahiliani <tahiliani@nitk.edu.in>
>
> FQ-PIE incorporates fair/flow queuing in which every queue
> is managed by an instance of PIE queueing discipline.
> The algorithm provides good control over the queueing delay
> while at the same time, ensures fairness across various
> flows sharing the same link.
>
> Principles:
>   - Packets are classified stochastically on flows.
>   - Each flow has a PIE managed queue.
>   - Flows are linked onto two (Round Robin) lists,
>     so that new flows have priority on old ones.
>   - For a given flow, packets are dropped on arrival according
>     to a drop probability.
>   - Tail drops only.

Why tail drops only?

> Usage:
> tc qdisc ... fq_pie [ limit PACKETS ] [ flows NUMBER ]
>                     [ alpha NUMBER ] [ beta NUMBER ]
>                     [ target TIME us ] [ tupdate TIME us ]
> 		    [ bytemode ] [ quantum BYTES ]
> 		    [ ecn | ecn_prob PERCENTAGE ]
>
> defaults: 1024 flows, 10240 packets limit, quantum : device MTU
>            target: 15ms
>            tupdate: 15ms
>            alpha: 2 (on a scale of 0 to 16)
>            beta: 20 (on a scale of 0 to 16)
>            ecn: false
>            ecn_prob: 10%
>
> Signed-off-by: Mohit P. Tahiliani <tahiliani@nitk.edu.in>
> Signed-off-by: Sachin D. Patil <sdp.sachin@gmail.com>
> Signed-off-by: Mohit Bhasi <mohitbhasi1998@gmail.com>
> Signed-off-by: V. Saicharan <vsaicharan1998@gmail.com>
> Signed-off-by: Leslie Monis <lesliemonis@gmail.com>
> Signed-off-by: Gautam Ramakrishnan <gautamramk@gmail.com>
> Cc: Dave Taht <dave.taht@gmail.com>
> ---
>  include/uapi/linux/pkt_sched.h |  28 ++
>  net/sched/Kconfig              |  14 +-
>  net/sched/Makefile             |   1 +
>  net/sched/sch_fq_pie.c         | 485 +++++++++++++++++++++++++++++++++
>  4 files changed, 527 insertions(+), 1 deletion(-)
>  create mode 100644 net/sched/sch_fq_pie.c
>
> diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
> index 7ee74c3474bf..005413bd09ee 100644
> --- a/include/uapi/linux/pkt_sched.h
> +++ b/include/uapi/linux/pkt_sched.h
> @@ -964,6 +964,34 @@ struct tc_pie_xstats {
>  	__u32 ecn_mark;         /* packets marked with ecn*/
>  };
>  
> +/* FQ PIE */
> +enum {
> +	TCA_FQ_PIE_UNSPEC,
> +	TCA_FQ_PIE_TARGET,
> +	TCA_FQ_PIE_LIMIT,
> +	TCA_FQ_PIE_TUPDATE,
> +	TCA_FQ_PIE_ALPHA,
> +	TCA_FQ_PIE_BETA,
> +	TCA_FQ_PIE_ECN,
> +	TCA_FQ_PIE_QUANTUM,
> +	TCA_FQ_PIE_BYTEMODE,
> +	TCA_FQ_PIE_FLOWS,
> +	TCA_FQ_PIE_ECN_PROB,
> +	__TCA_FQ_PIE_MAX
> +};
> +#define TCA_FQ_PIE_MAX   (__TCA_FQ_PIE_MAX - 1)
> +
> +struct tc_fq_pie_xstats {
> +	__u32 packets_in;       /* total number of packets enqueued */
> +	__u32 dropped;          /* packets dropped due to fq_pie_action */
> +	__u32 overlimit;        /* dropped due to lack of space in queue */
> +	__u32 ecn_mark;         /* packets marked with ecn*/
> +	__u32 new_flow_count;   /* number of time packets
> +				   created a 'new flow' */
> +	__u32 new_flows_len;	/* count of flows in new list */
> +	__u32 old_flows_len;	/* count of flows in old list */
> +};
> +
>  /* CBS */
>  struct tc_cbs_qopt {
>  	__u8 offload;
> diff --git a/net/sched/Kconfig b/net/sched/Kconfig
> index 5c02ad97ef23..49f4dd9894a0 100644
> --- a/net/sched/Kconfig
> +++ b/net/sched/Kconfig
> @@ -358,13 +358,25 @@ config NET_SCH_PIE
>  	help
>  	  Say Y here if you want to use the Proportional Integral controller
>  	  Enhanced scheduler packet scheduling algorithm.
> -	  For more information, please see https://tools.ietf.org/html/rfc8033
> +	  For more information, please see
> +	  http://tools.ietf.org/html/draft-pan-tsvwg-pie-00
>  
>  	  To compile this driver as a module, choose M here: the module
>  	  will be called sch_pie.
>  
>  	  If unsure, say N.
>  
> +config NET_SCH_FQ_PIE
> +	tristate "Flow Queue Proportional Integral controller Enhanced (FQ-PIE) scheduler"
> +	help
> +	  Say Y here if you want to use the Flow Queue Proportional Integral controller
> +	  Enhanced scheduler packet scheduling algorithm.
> +
> +	  To compile this driver as a module, choose M here: the module
> +	  will be called sch_fq_pie.
> +
> +	  If unsure, say N.
> +
>  config NET_SCH_INGRESS
>  	tristate "Ingress/classifier-action Qdisc"
>  	depends on NET_CLS_ACT
> diff --git a/net/sched/Makefile b/net/sched/Makefile
> index 8a40431d7b5c..fdcd3f7b2fb2 100644
> --- a/net/sched/Makefile
> +++ b/net/sched/Makefile
> @@ -55,6 +55,7 @@ obj-$(CONFIG_NET_SCH_CAKE)	+= sch_cake.o
>  obj-$(CONFIG_NET_SCH_FQ)	+= sch_fq.o
>  obj-$(CONFIG_NET_SCH_HHF)	+= sch_hhf.o
>  obj-$(CONFIG_NET_SCH_PIE)	+= sch_pie.o
> +obj-$(CONFIG_NET_SCH_FQ_PIE)    += sch_fq_pie.o
>  obj-$(CONFIG_NET_SCH_CBS)	+= sch_cbs.o
>  obj-$(CONFIG_NET_SCH_ETF)	+= sch_etf.o
>  obj-$(CONFIG_NET_SCH_TAPRIO)	+= sch_taprio.o
> diff --git a/net/sched/sch_fq_pie.c b/net/sched/sch_fq_pie.c
> new file mode 100644
> index 000000000000..4ccefa0bc7f0
> --- /dev/null
> +++ b/net/sched/sch_fq_pie.c
> @@ -0,0 +1,485 @@
> +/*
> + * net/sched/sch_fq_pie.c
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public License
> + * as published by the Free Software Foundation; either version 2
> + * of the License.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.

Lose the license boilerplate and replace it with a SPDX header line.

> + * Author: Mohit P. Tahiliani <tahiliani@nitk.edu.in>
> + * Author: Sachin D. Patil <sdp.sachin@gmail.com>
> + * Author: Mohit Bhasi <mohitbhasi1998@gmail.com>
> + * Author: V Saicharan <vsaicharan1998@gmail.com>
> + * Author: Leslie Monis <lesliemonis@gmail.com>
> + * Author: Gautam Ramakrishnan <gautamramk@gmail.com>
> + *
> + * References:
> + * RFC 8033: https://tools.ietf.org/html/rfc8033
> + */
> +
> +#include <linux/jhash.h>
> +#include <linux/vmalloc.h>
> +#include <net/pie.h>
> +
> +struct fq_pie_params {
> +	struct pie_params p_params;
> +	u32 ecn_prob;
> +	u32 flows_cnt;
> +};
> +
> +struct fq_pie_stats {
> +	u32 packets_in;		/* total number of packets enqueued */
> +	u32 dropped;		/* packets dropped due to fq_pie action */
> +	u32 overlimit;		/* dropped due to lack of space in queue */
> +	u32 ecn_mark;		/* packets marked with ECN */
> +	u32 new_flow_count;	/* number of time packets created a new flow */
> +};
> +
> +struct fq_pie_flow {
> +	s32 deficit;		/* number of credits remaining for the flow */
> +	u32 backlog;		/* size of data in the flow */
> +	u32 qlen;		/* number of packets in the flow */
> +	struct sk_buff *head;
> +	struct sk_buff *tail;
> +	struct list_head flowchain;
> +	struct pie_vars vars;	/* pie vars for the flow */
> +	struct pie_stats stats;	/* pie stats for the flow */
> +};
> +
> +struct fq_pie_sched_data {
> +	u32 quantum;		/* number of credits in deficit round robin */
> +	struct fq_pie_flow *flows;
> +	struct Qdisc *sch;
> +	struct fq_pie_params params;
> +	struct fq_pie_stats stats;
> +	struct list_head old_flows;
> +	struct list_head new_flows;
> +	struct timer_list adapt_timer;
> +};

The flow and sched_data structs have quite a bit in common with those in
fq_codel; but the members are in a different order.

> +static void fq_pie_params_init(struct fq_pie_params *params)
> +{
> +	pie_params_init(&params->p_params);
> +	params->ecn_prob = 10;
> +	params->flows_cnt = 1024;
> +}
> +
> +static inline void flow_queue_add(struct fq_pie_flow *flow,
> +				  struct sk_buff *skb)
> +{
> +	if (!flow->head)
> +		flow->head = skb;
> +	else
> +		flow->tail->next = skb;
> +	flow->tail = skb;
> +	skb->next = NULL;
> +}
> +
> +static int fq_pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
> +				struct sk_buff **to_free)
> +{
> +	struct fq_pie_sched_data *q = qdisc_priv(sch);
> +	struct fq_pie_flow *sel_flow;
> +	u32 pkt_len;
> +	u32 idx;
> +	u8 enqueue = false;
> +
> +	/* Classifies packet into corresponding flow */
> +	idx = reciprocal_scale(skb_get_hash(skb), q->params.flows_cnt);
> +	sel_flow = &q->flows[idx];

This is missing the ability to override the classification from tc
filters. See fq_codel_classify().

> +
> +	/* Checks if the qdisc is full */
> +	if (unlikely(qdisc_qlen(sch) >= sch->limit)) {
> +		q->stats.overlimit++;
> +		sel_flow->stats.overlimit++;
> +		goto out;
> +	}

The memory_limit checks in fq_codel have turned out to be quite useful
on constrained systems. I'd suggest adding them here as well.

> +
> +	if (!drop_early(sch, sel_flow->backlog, skb->len, &sel_flow->vars,
> +			&q->params.p_params)) {
> +		enqueue = true;
> +	} else if (q->params.p_params.ecn &&
> +		   sel_flow->vars.prob <=
> +		   (MAX_PROB / 100) * q->params.ecn_prob &&
> +		   INET_ECN_set_ce(skb)) {
> +		/* If packet is ecn capable, mark it if drop probability
> +		 * is lower than the parameter ecn_prob, else drop it.
> +		 */
> +		q->stats.ecn_mark++;
> +		sel_flow->stats.ecn_mark++;
> +		enqueue = true;
> +	}
> +	if (enqueue) {
> +		pkt_len = qdisc_pkt_len(skb);
> +		q->stats.packets_in++;
> +		sch->qstats.backlog += pkt_len;
> +		sch->q.qlen++;
> +		flow_queue_add(sel_flow, skb);
> +		if (list_empty(&sel_flow->flowchain)) {
> +			list_add_tail(&sel_flow->flowchain, &q->new_flows);
> +			q->stats.new_flow_count++;
> +			sel_flow->deficit = q->quantum;
> +			sel_flow->stats.dropped = 0;
> +			sel_flow->qlen = 0;
> +			sel_flow->backlog = 0;
> +		}
> +		sel_flow->qlen++;
> +		sel_flow->stats.packets_in++;
> +		sel_flow->backlog += pkt_len;
> +		return NET_XMIT_SUCCESS;
> +	}
> +out:
> +	q->stats.dropped++;
> +	sel_flow->stats.dropped++;
> +	return qdisc_drop(skb, sch, to_free);

You probably want to return NET_XMIT_CN instead of NET_XMIT_DROP here.

> +}
> +
> +static const struct nla_policy fq_pie_policy[TCA_FQ_PIE_MAX + 1] = {
> +	[TCA_FQ_PIE_TARGET] = {.type = NLA_U32},
> +	[TCA_FQ_PIE_LIMIT] = {.type = NLA_U32},
> +	[TCA_FQ_PIE_TUPDATE] = {.type = NLA_U32},
> +	[TCA_FQ_PIE_ALPHA] = {.type = NLA_U32},
> +	[TCA_FQ_PIE_BETA] = {.type = NLA_U32},
> +	[TCA_FQ_PIE_ECN] = {.type = NLA_U32},
> +	[TCA_FQ_PIE_QUANTUM] = {.type = NLA_U32},
> +	[TCA_FQ_PIE_BYTEMODE] = {.type = NLA_U32},
> +	[TCA_FQ_PIE_FLOWS] = {.type = NLA_U32},
> +	[TCA_FQ_PIE_ECN_PROB] = {.type = NLA_U32}
> +};
> +
> +static inline struct sk_buff *dequeue_head(struct fq_pie_flow *flow)
> +{
> +	struct sk_buff *skb = flow->head;
> +
> +	flow->head = skb->next;
> +	skb->next = NULL;
> +	return skb;
> +}
> +
> +static struct sk_buff *fq_pie_qdisc_dequeue(struct Qdisc *sch)
> +{
> +	struct fq_pie_sched_data *q = qdisc_priv(sch);
> +	struct sk_buff *skb = NULL;
> +	struct fq_pie_flow *flow;
> +	struct list_head *head;
> +	u32 pkt_len;
> +
> +begin:
> +	head = &q->new_flows;
> +	if (list_empty(head)) {
> +		head = &q->old_flows;
> +		if (list_empty(head))
> +			return NULL;
> +	}
> +
> +	flow = list_first_entry(head, struct fq_pie_flow, flowchain);
> +	/* Flow has exhausted all its credits */
> +	if (flow->deficit <= 0) {
> +		flow->deficit += q->quantum;
> +		list_move_tail(&flow->flowchain, &q->old_flows);
> +		goto begin;
> +	}
> +
> +	if (flow->head) {
> +		skb = dequeue_head(flow);
> +		pkt_len = qdisc_pkt_len(skb);
> +		sch->qstats.backlog -= pkt_len;
> +		sch->q.qlen--;
> +		qdisc_bstats_update(sch, skb);
> +	}

If you factor this out into a dequeue_func(), this dequeue() function is
very close to identical to the one in fq_codel().

> +	if (!skb) {
> +		if (head == &q->new_flows && !list_empty(&q->old_flows))
> +			list_move_tail(&flow->flowchain, &q->old_flows);
> +		else
> +			list_del_init(&flow->flowchain);
> +		goto begin;
> +	}
> +
> +	flow->qlen--;
> +	flow->deficit -= pkt_len;
> +	flow->backlog -= pkt_len;
> +	pie_process_dequeue(flow->backlog, skb, &flow->vars);
> +	return skb;
> +}
> +
> +static int fq_pie_change(struct Qdisc *sch, struct nlattr *opt,
> +			 struct netlink_ext_ack *extack)
> +{
> +	struct fq_pie_sched_data *q = qdisc_priv(sch);
> +	struct nlattr *tb[TCA_FQ_PIE_MAX + 1];
> +	unsigned int len_dropped = 0;
> +	unsigned int num_dropped = 0;
> +	unsigned int qlen;
> +	int err;
> +
> +	if (!opt)
> +		return -EINVAL;
> +
> +	err = nla_parse_nested(tb, TCA_FQ_PIE_MAX, opt, fq_pie_policy, NULL);
> +	if (err < 0)
> +		return err;
> +
> +	if (tb[TCA_FQ_PIE_FLOWS]) {
> +		if (q->flows)
> +			return -EINVAL;
> +		q->params.flows_cnt = nla_get_u32(tb[TCA_FQ_PIE_FLOWS]);
> +		if (!q->params.flows_cnt ||
> +		    q->params.flows_cnt > 65536)
> +			return -EINVAL;
> +	}
> +
> +	sch_tree_lock(sch);
> +
> +	/* convert from microseconds to pschedtime */
> +	if (tb[TCA_FQ_PIE_TARGET]) {
> +		/* target is in us */
> +		u32 target = nla_get_u32(tb[TCA_FQ_PIE_TARGET]);
> +
> +		/* convert to pschedtime */
> +		q->params.p_params.target =
> +		PSCHED_NS2TICKS((u64)target * NSEC_PER_USEC);
> +	}
> +
> +	/* tupdate is in jiffies */
> +	if (tb[TCA_FQ_PIE_TUPDATE])
> +		q->params.p_params.tupdate =
> +		usecs_to_jiffies(nla_get_u32(tb[TCA_FQ_PIE_TUPDATE]));
> +
> +	if (tb[TCA_FQ_PIE_LIMIT]) {
> +		u32 limit = nla_get_u32(tb[TCA_FQ_PIE_LIMIT]);
> +
> +		q->params.p_params.limit = limit;
> +		sch->limit = limit;
> +	}
> +
> +	if (tb[TCA_FQ_PIE_ECN_PROB])
> +		q->params.ecn_prob = nla_get_u32(tb[TCA_FQ_PIE_ECN_PROB]);
> +
> +	if (tb[TCA_FQ_PIE_ALPHA])
> +		q->params.p_params.alpha = nla_get_u32(tb[TCA_FQ_PIE_ALPHA]);
> +
> +	if (tb[TCA_FQ_PIE_BETA])
> +		q->params.p_params.beta = nla_get_u32(tb[TCA_FQ_PIE_BETA]);
> +
> +	if (tb[TCA_FQ_PIE_ECN])
> +		q->params.p_params.ecn = nla_get_u32(tb[TCA_FQ_PIE_ECN]);
> +
> +	if (tb[TCA_FQ_PIE_QUANTUM])
> +		q->quantum = nla_get_u32(tb[TCA_FQ_PIE_QUANTUM]);
> +
> +	if (tb[TCA_FQ_PIE_BYTEMODE])
> +		q->params.p_params.bytemode =
> +		nla_get_u32(tb[TCA_FQ_PIE_BYTEMODE]);
> +
> +	/* Drop excess packets if new limit is lower */
> +	qlen = sch->q.qlen;
> +	while (sch->q.qlen > sch->limit) {
> +		struct sk_buff *skb = fq_pie_qdisc_dequeue(sch);
> +
> +		kfree_skb(skb);
> +		len_dropped += qdisc_pkt_len(skb);
> +		num_dropped += 1;
> +	}
> +	qdisc_tree_reduce_backlog(sch, num_dropped, len_dropped);
> +
> +	sch_tree_unlock(sch);
> +	return 0;
> +}
> +
> +static void fq_pie_timer(struct timer_list *t)
> +{
> +	struct fq_pie_sched_data *q = from_timer(q, t, adapt_timer);
> +	struct Qdisc *sch = q->sch;
> +	spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch));
> +	u16 idx;
> +
> +	spin_lock(root_lock);
> +
> +	for (idx = 0; idx < q->params.flows_cnt; idx++)
> +		calculate_probability(q->flows[idx].backlog,
> +				      &q->flows[idx].vars,
> +				      &q->params.p_params);
> +
> +	// reset the timer to fire after 'tupdate'. tupdate is in jiffies.
> +	if (q->params.p_params.tupdate)
> +		mod_timer(&q->adapt_timer,
> +			  jiffies + q->params.p_params.tupdate);
> +	spin_unlock(root_lock);
> +}
> +
> +static int fq_pie_init(struct Qdisc *sch, struct nlattr *opt,
> +		       struct netlink_ext_ack *extack)
> +{
> +	struct fq_pie_sched_data *q = qdisc_priv(sch);
> +	int err;
> +	int i;
> +
> +	fq_pie_params_init(&q->params);
> +	sch->limit = 10 * 1024;
> +	q->params.p_params.limit = sch->limit;
> +	q->quantum = psched_mtu(qdisc_dev(sch));
> +	q->sch = sch;
> +
> +	INIT_LIST_HEAD(&q->new_flows);
> +	INIT_LIST_HEAD(&q->old_flows);
> +
> +	timer_setup(&q->adapt_timer, fq_pie_timer, 0);
> +	mod_timer(&q->adapt_timer, jiffies + HZ / 2);
> +
> +	if (opt) {
> +		int err = fq_pie_change(sch, opt, extack);
> +
> +		if (err)
> +			return err;
> +	}
> +
> +	if (!q->flows) {
> +		q->flows = kvcalloc(q->params.flows_cnt,
> +				    sizeof(struct fq_pie_flow),
> +				    GFP_KERNEL);
> +		if (!q->flows) {
> +			err = -ENOMEM;
> +			goto init_failure;
> +		}
> +		for (i = 0; i < q->params.flows_cnt; i++) {
> +			struct fq_pie_flow *flow = q->flows + i;
> +
> +			INIT_LIST_HEAD(&flow->flowchain);
> +			pie_vars_init(&flow->vars);
> +		}
> +	}
> +	return 0;
> +
> +init_failure:
> +	q->params.flows_cnt = 0;
> +
> +	return err;
> +}
> +
> +static int fq_pie_dump(struct Qdisc *sch, struct sk_buff *skb)
> +{
> +	struct fq_pie_sched_data *q = qdisc_priv(sch);
> +	struct nlattr *opts;
> +
> +	opts = nla_nest_start(skb, TCA_OPTIONS);
> +	if (!opts)
> +		goto nla_put_failure;
> +
> +	/* convert target from pschedtime to us */
> +	if (nla_put_u32(skb, TCA_FQ_PIE_TARGET,
> +			((u32)PSCHED_TICKS2NS(q->params.p_params.target)) /
> +			NSEC_PER_USEC) ||
> +	    nla_put_u32(skb, TCA_FQ_PIE_LIMIT, sch->limit) ||
> +	    nla_put_u32(skb, TCA_FQ_PIE_TUPDATE,
> +			jiffies_to_usecs(q->params.p_params.tupdate)) ||
> +	    nla_put_u32(skb, TCA_FQ_PIE_ALPHA, q->params.p_params.alpha) ||
> +	    nla_put_u32(skb, TCA_FQ_PIE_BETA, q->params.p_params.beta) ||
> +	    nla_put_u32(skb, TCA_FQ_PIE_ECN, q->params.p_params.ecn) ||
> +	    nla_put_u32(skb, TCA_FQ_PIE_BYTEMODE,
> +			q->params.p_params.bytemode) ||
> +	    nla_put_u32(skb, TCA_FQ_PIE_QUANTUM, q->quantum) ||
> +	    nla_put_u32(skb, TCA_FQ_PIE_FLOWS, q->params.flows_cnt) ||
> +	    nla_put_u32(skb, TCA_FQ_PIE_ECN_PROB, q->params.ecn_prob))
> +		goto nla_put_failure;
> +
> +	return nla_nest_end(skb, opts);
> +
> +nla_put_failure:
> +	return -1;
> +}
> +
> +static int fq_pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
> +{
> +	struct fq_pie_sched_data *q = qdisc_priv(sch);
> +	struct tc_fq_pie_xstats st = {
> +		.packets_in	= q->stats.packets_in,
> +		.overlimit	= q->stats.overlimit,
> +		.dropped	= q->stats.dropped,
> +		.ecn_mark	= q->stats.ecn_mark,
> +		.new_flow_count = q->stats.new_flow_count,
> +	};
> +	struct list_head *pos;
> +
> +	sch_tree_lock(sch);
> +	list_for_each(pos, &q->new_flows)
> +		st.new_flows_len++;
> +
> +	list_for_each(pos, &q->old_flows)
> +		st.old_flows_len++;
> +	sch_tree_unlock(sch);
> +
> +	return gnet_stats_copy_app(d, &st, sizeof(st));
> +}
> +
> +static void fq_pie_reset(struct Qdisc *sch)
> +{
> +	struct fq_pie_sched_data *q = qdisc_priv(sch);
> +	u16 idx;
> +
> +	INIT_LIST_HEAD(&q->new_flows);
> +	INIT_LIST_HEAD(&q->old_flows);
> +	for (idx = 0; idx < q->params.flows_cnt; idx++) {
> +		struct fq_pie_flow *flow = q->flows + idx;
> +
> +		/* Removes all packets from flow */
> +		rtnl_kfree_skbs(flow->head, flow->tail);
> +		flow->head = NULL;
> +
> +		INIT_LIST_HEAD(&flow->flowchain);
> +		pie_vars_init(&flow->vars);
> +	}
> +
> +	sch->q.qlen = 0;
> +	sch->qstats.backlog = 0;
> +}
> +
> +static void fq_pie_destroy(struct Qdisc *sch)
> +{
> +	struct fq_pie_sched_data *q = qdisc_priv(sch);
> +
> +	kvfree(q->flows);
> +	del_timer_sync(&q->adapt_timer);
> +}
> +
> +static struct Qdisc_ops fq_pie_qdisc_ops __read_mostly = {
> +	.id = "fq_pie",
> +	.priv_size	= sizeof(struct fq_pie_sched_data),
> +	.enqueue	= fq_pie_qdisc_enqueue,
> +	.dequeue	= fq_pie_qdisc_dequeue,
> +	.peek		= qdisc_peek_dequeued,
> +	.init		= fq_pie_init,
> +	.destroy	= fq_pie_destroy,
> +	.reset		= fq_pie_reset,
> +	.change		= fq_pie_change,
> +	.dump		= fq_pie_dump,
> +	.dump_stats	= fq_pie_dump_stats,
> +	.owner		= THIS_MODULE,
> +};
> +
> +static int __init fq_pie_module_init(void)
> +{
> +	return register_qdisc(&fq_pie_qdisc_ops);
> +}
> +
> +static void __exit fq_pie_module_exit(void)
> +{
> +	unregister_qdisc(&fq_pie_qdisc_ops);
> +}
> +
> +module_init(fq_pie_module_init);
> +module_exit(fq_pie_module_exit);
> +
> +MODULE_DESCRIPTION("Flow Queue Proportional Integral controller Enhanced (FQ-PIE) scheduler");
> +MODULE_AUTHOR("Mohit P. Tahiliani");
> +MODULE_AUTHOR("Sachin D. Patil");
> +MODULE_AUTHOR("Mohit Bhasi");
> +MODULE_AUTHOR("V Saicharan");
> +MODULE_AUTHOR("Leslie Monis");
> +MODULE_AUTHOR("Gautam Ramakrishnan");
> +MODULE_LICENSE("GPL");
> -- 
> 2.17.1
Gautam Ramakrishnan April 2, 2019, 3:57 p.m. UTC | #2
Hello, thanks for the feedback

On Tue, Apr 2, 2019 at 4:19 PM Toke Høiland-Jørgensen <toke@redhat.com> wrote:
>
> Some suggestions below to make fq_pie and fq_codel more similar (ref. my
> previous email).
>
> Also, a few unrelated nits.
>
> > From: Mohit P. Tahiliani <tahiliani@nitk.edu.in>
> >
> > FQ-PIE incorporates fair/flow queuing in which every queue
> > is managed by an instance of PIE queueing discipline.
> > The algorithm provides good control over the queueing delay
> > while at the same time, ensures fairness across various
> > flows sharing the same link.
> >
> > Principles:
> >   - Packets are classified stochastically on flows.
> >   - Each flow has a PIE managed queue.
> >   - Flows are linked onto two (Round Robin) lists,
> >     so that new flows have priority on old ones.
> >   - For a given flow, packets are dropped on arrival according
> >     to a drop probability.
> >   - Tail drops only.
>
> Why tail drops only?

I had mentioned this because packets are dropped only at enqueue.

>
> > Usage:
> > tc qdisc ... fq_pie [ limit PACKETS ] [ flows NUMBER ]
> >                     [ alpha NUMBER ] [ beta NUMBER ]
> >                     [ target TIME us ] [ tupdate TIME us ]
> >                   [ bytemode ] [ quantum BYTES ]
> >                   [ ecn | ecn_prob PERCENTAGE ]
> >
> > defaults: 1024 flows, 10240 packets limit, quantum : device MTU
> >            target: 15ms
> >            tupdate: 15ms
> >            alpha: 2 (on a scale of 0 to 16)
> >            beta: 20 (on a scale of 0 to 16)
> >            ecn: false
> >            ecn_prob: 10%
> >
> > Signed-off-by: Mohit P. Tahiliani <tahiliani@nitk.edu.in>
> > Signed-off-by: Sachin D. Patil <sdp.sachin@gmail.com>
> > Signed-off-by: Mohit Bhasi <mohitbhasi1998@gmail.com>
> > Signed-off-by: V. Saicharan <vsaicharan1998@gmail.com>
> > Signed-off-by: Leslie Monis <lesliemonis@gmail.com>
> > Signed-off-by: Gautam Ramakrishnan <gautamramk@gmail.com>
> > Cc: Dave Taht <dave.taht@gmail.com>
> > ---
> >  include/uapi/linux/pkt_sched.h |  28 ++
> >  net/sched/Kconfig              |  14 +-
> >  net/sched/Makefile             |   1 +
> >  net/sched/sch_fq_pie.c         | 485 +++++++++++++++++++++++++++++++++
> >  4 files changed, 527 insertions(+), 1 deletion(-)
> >  create mode 100644 net/sched/sch_fq_pie.c
> >
> > diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
> > index 7ee74c3474bf..005413bd09ee 100644
> > --- a/include/uapi/linux/pkt_sched.h
> > +++ b/include/uapi/linux/pkt_sched.h
> > @@ -964,6 +964,34 @@ struct tc_pie_xstats {
> >       __u32 ecn_mark;         /* packets marked with ecn*/
> >  };
> >
> > +/* FQ PIE */
> > +enum {
> > +     TCA_FQ_PIE_UNSPEC,
> > +     TCA_FQ_PIE_TARGET,
> > +     TCA_FQ_PIE_LIMIT,
> > +     TCA_FQ_PIE_TUPDATE,
> > +     TCA_FQ_PIE_ALPHA,
> > +     TCA_FQ_PIE_BETA,
> > +     TCA_FQ_PIE_ECN,
> > +     TCA_FQ_PIE_QUANTUM,
> > +     TCA_FQ_PIE_BYTEMODE,
> > +     TCA_FQ_PIE_FLOWS,
> > +     TCA_FQ_PIE_ECN_PROB,
> > +     __TCA_FQ_PIE_MAX
> > +};
> > +#define TCA_FQ_PIE_MAX   (__TCA_FQ_PIE_MAX - 1)
> > +
> > +struct tc_fq_pie_xstats {
> > +     __u32 packets_in;       /* total number of packets enqueued */
> > +     __u32 dropped;          /* packets dropped due to fq_pie_action */
> > +     __u32 overlimit;        /* dropped due to lack of space in queue */
> > +     __u32 ecn_mark;         /* packets marked with ecn*/
> > +     __u32 new_flow_count;   /* number of time packets
> > +                                created a 'new flow' */
> > +     __u32 new_flows_len;    /* count of flows in new list */
> > +     __u32 old_flows_len;    /* count of flows in old list */
> > +};
> > +
> >  /* CBS */
> >  struct tc_cbs_qopt {
> >       __u8 offload;
> > diff --git a/net/sched/Kconfig b/net/sched/Kconfig
> > index 5c02ad97ef23..49f4dd9894a0 100644
> > --- a/net/sched/Kconfig
> > +++ b/net/sched/Kconfig
> > @@ -358,13 +358,25 @@ config NET_SCH_PIE
> >       help
> >         Say Y here if you want to use the Proportional Integral controller
> >         Enhanced scheduler packet scheduling algorithm.
> > -       For more information, please see https://tools.ietf.org/html/rfc8033
> > +       For more information, please see
> > +       http://tools.ietf.org/html/draft-pan-tsvwg-pie-00
> >
> >         To compile this driver as a module, choose M here: the module
> >         will be called sch_pie.
> >
> >         If unsure, say N.
> >
> > +config NET_SCH_FQ_PIE
> > +     tristate "Flow Queue Proportional Integral controller Enhanced (FQ-PIE) scheduler"
> > +     help
> > +       Say Y here if you want to use the Flow Queue Proportional Integral controller
> > +       Enhanced scheduler packet scheduling algorithm.
> > +
> > +       To compile this driver as a module, choose M here: the module
> > +       will be called sch_fq_pie.
> > +
> > +       If unsure, say N.
> > +
> >  config NET_SCH_INGRESS
> >       tristate "Ingress/classifier-action Qdisc"
> >       depends on NET_CLS_ACT
> > diff --git a/net/sched/Makefile b/net/sched/Makefile
> > index 8a40431d7b5c..fdcd3f7b2fb2 100644
> > --- a/net/sched/Makefile
> > +++ b/net/sched/Makefile
> > @@ -55,6 +55,7 @@ obj-$(CONFIG_NET_SCH_CAKE)  += sch_cake.o
> >  obj-$(CONFIG_NET_SCH_FQ)     += sch_fq.o
> >  obj-$(CONFIG_NET_SCH_HHF)    += sch_hhf.o
> >  obj-$(CONFIG_NET_SCH_PIE)    += sch_pie.o
> > +obj-$(CONFIG_NET_SCH_FQ_PIE)    += sch_fq_pie.o
> >  obj-$(CONFIG_NET_SCH_CBS)    += sch_cbs.o
> >  obj-$(CONFIG_NET_SCH_ETF)    += sch_etf.o
> >  obj-$(CONFIG_NET_SCH_TAPRIO) += sch_taprio.o
> > diff --git a/net/sched/sch_fq_pie.c b/net/sched/sch_fq_pie.c
> > new file mode 100644
> > index 000000000000..4ccefa0bc7f0
> > --- /dev/null
> > +++ b/net/sched/sch_fq_pie.c
> > @@ -0,0 +1,485 @@
> > +/*
> > + * net/sched/sch_fq_pie.c
> > + *
> > + * This program is free software; you can redistribute it and/or
> > + * modify it under the terms of the GNU General Public License
> > + * as published by the Free Software Foundation; either version 2
> > + * of the License.
> > + *
> > + * This program is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> > + * GNU General Public License for more details.
>
> Lose the license boilerplate and replace it with a SPDX header line.

I shall do that.

>
> > + * Author: Mohit P. Tahiliani <tahiliani@nitk.edu.in>
> > + * Author: Sachin D. Patil <sdp.sachin@gmail.com>
> > + * Author: Mohit Bhasi <mohitbhasi1998@gmail.com>
> > + * Author: V Saicharan <vsaicharan1998@gmail.com>
> > + * Author: Leslie Monis <lesliemonis@gmail.com>
> > + * Author: Gautam Ramakrishnan <gautamramk@gmail.com>
> > + *
> > + * References:
> > + * RFC 8033: https://tools.ietf.org/html/rfc8033
> > + */
> > +
> > +#include <linux/jhash.h>
> > +#include <linux/vmalloc.h>
> > +#include <net/pie.h>
> > +
> > +struct fq_pie_params {
> > +     struct pie_params p_params;
> > +     u32 ecn_prob;
> > +     u32 flows_cnt;
> > +};
> > +
> > +struct fq_pie_stats {
> > +     u32 packets_in;         /* total number of packets enqueued */
> > +     u32 dropped;            /* packets dropped due to fq_pie action */
> > +     u32 overlimit;          /* dropped due to lack of space in queue */
> > +     u32 ecn_mark;           /* packets marked with ECN */
> > +     u32 new_flow_count;     /* number of time packets created a new flow */
> > +};
> > +
> > +struct fq_pie_flow {
> > +     s32 deficit;            /* number of credits remaining for the flow */
> > +     u32 backlog;            /* size of data in the flow */
> > +     u32 qlen;               /* number of packets in the flow */
> > +     struct sk_buff *head;
> > +     struct sk_buff *tail;
> > +     struct list_head flowchain;
> > +     struct pie_vars vars;   /* pie vars for the flow */
> > +     struct pie_stats stats; /* pie stats for the flow */
> > +};
> > +
> > +struct fq_pie_sched_data {
> > +     u32 quantum;            /* number of credits in deficit round robin */
> > +     struct fq_pie_flow *flows;
> > +     struct Qdisc *sch;
> > +     struct fq_pie_params params;
> > +     struct fq_pie_stats stats;
> > +     struct list_head old_flows;
> > +     struct list_head new_flows;
> > +     struct timer_list adapt_timer;
> > +};
>
> The flow and sched_data structs have quite a bit in common with those in
> fq_codel; but the members are in a different order.
>
> > +static void fq_pie_params_init(struct fq_pie_params *params)
> > +{
> > +     pie_params_init(&params->p_params);
> > +     params->ecn_prob = 10;
> > +     params->flows_cnt = 1024;
> > +}
> > +
> > +static inline void flow_queue_add(struct fq_pie_flow *flow,
> > +                               struct sk_buff *skb)
> > +{
> > +     if (!flow->head)
> > +             flow->head = skb;
> > +     else
> > +             flow->tail->next = skb;
> > +     flow->tail = skb;
> > +     skb->next = NULL;
> > +}
> > +
> > +static int fq_pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
> > +                             struct sk_buff **to_free)
> > +{
> > +     struct fq_pie_sched_data *q = qdisc_priv(sch);
> > +     struct fq_pie_flow *sel_flow;
> > +     u32 pkt_len;
> > +     u32 idx;
> > +     u8 enqueue = false;
> > +
> > +     /* Classifies packet into corresponding flow */
> > +     idx = reciprocal_scale(skb_get_hash(skb), q->params.flows_cnt);
> > +     sel_flow = &q->flows[idx];
>
> This is missing the ability to override the classification from tc
> filters. See fq_codel_classify().

I wanted to keep it simple initially. Shall add that.

>
> > +
> > +     /* Checks if the qdisc is full */
> > +     if (unlikely(qdisc_qlen(sch) >= sch->limit)) {
> > +             q->stats.overlimit++;
> > +             sel_flow->stats.overlimit++;
> > +             goto out;
> > +     }
>
> The memory_limit checks in fq_codel have turned out to be quite useful
> on constrained systems. I'd suggest adding them here as well.

I shall add that too.

>
> > +
> > +     if (!drop_early(sch, sel_flow->backlog, skb->len, &sel_flow->vars,
> > +                     &q->params.p_params)) {
> > +             enqueue = true;
> > +     } else if (q->params.p_params.ecn &&
> > +                sel_flow->vars.prob <=
> > +                (MAX_PROB / 100) * q->params.ecn_prob &&
> > +                INET_ECN_set_ce(skb)) {
> > +             /* If packet is ecn capable, mark it if drop probability
> > +              * is lower than the parameter ecn_prob, else drop it.
> > +              */
> > +             q->stats.ecn_mark++;
> > +             sel_flow->stats.ecn_mark++;
> > +             enqueue = true;
> > +     }
> > +     if (enqueue) {
> > +             pkt_len = qdisc_pkt_len(skb);
> > +             q->stats.packets_in++;
> > +             sch->qstats.backlog += pkt_len;
> > +             sch->q.qlen++;
> > +             flow_queue_add(sel_flow, skb);
> > +             if (list_empty(&sel_flow->flowchain)) {
> > +                     list_add_tail(&sel_flow->flowchain, &q->new_flows);
> > +                     q->stats.new_flow_count++;
> > +                     sel_flow->deficit = q->quantum;
> > +                     sel_flow->stats.dropped = 0;
> > +                     sel_flow->qlen = 0;
> > +                     sel_flow->backlog = 0;
> > +             }
> > +             sel_flow->qlen++;
> > +             sel_flow->stats.packets_in++;
> > +             sel_flow->backlog += pkt_len;
> > +             return NET_XMIT_SUCCESS;
> > +     }
> > +out:
> > +     q->stats.dropped++;
> > +     sel_flow->stats.dropped++;
> > +     return qdisc_drop(skb, sch, to_free);
>
> You probably want to return NET_XMIT_CN instead of NET_XMIT_DROP here.
>

Should I replace qdisc_drop with __qdisc_drop and return NET_XMIT_CN?

> > +}
> > +
> > +static const struct nla_policy fq_pie_policy[TCA_FQ_PIE_MAX + 1] = {
> > +     [TCA_FQ_PIE_TARGET] = {.type = NLA_U32},
> > +     [TCA_FQ_PIE_LIMIT] = {.type = NLA_U32},
> > +     [TCA_FQ_PIE_TUPDATE] = {.type = NLA_U32},
> > +     [TCA_FQ_PIE_ALPHA] = {.type = NLA_U32},
> > +     [TCA_FQ_PIE_BETA] = {.type = NLA_U32},
> > +     [TCA_FQ_PIE_ECN] = {.type = NLA_U32},
> > +     [TCA_FQ_PIE_QUANTUM] = {.type = NLA_U32},
> > +     [TCA_FQ_PIE_BYTEMODE] = {.type = NLA_U32},
> > +     [TCA_FQ_PIE_FLOWS] = {.type = NLA_U32},
> > +     [TCA_FQ_PIE_ECN_PROB] = {.type = NLA_U32}
> > +};
> > +
> > +static inline struct sk_buff *dequeue_head(struct fq_pie_flow *flow)
> > +{
> > +     struct sk_buff *skb = flow->head;
> > +
> > +     flow->head = skb->next;
> > +     skb->next = NULL;
> > +     return skb;
> > +}
> > +
> > +static struct sk_buff *fq_pie_qdisc_dequeue(struct Qdisc *sch)
> > +{
> > +     struct fq_pie_sched_data *q = qdisc_priv(sch);
> > +     struct sk_buff *skb = NULL;
> > +     struct fq_pie_flow *flow;
> > +     struct list_head *head;
> > +     u32 pkt_len;
> > +
> > +begin:
> > +     head = &q->new_flows;
> > +     if (list_empty(head)) {
> > +             head = &q->old_flows;
> > +             if (list_empty(head))
> > +                     return NULL;
> > +     }
> > +
> > +     flow = list_first_entry(head, struct fq_pie_flow, flowchain);
> > +     /* Flow has exhausted all its credits */
> > +     if (flow->deficit <= 0) {
> > +             flow->deficit += q->quantum;
> > +             list_move_tail(&flow->flowchain, &q->old_flows);
> > +             goto begin;
> > +     }
> > +
> > +     if (flow->head) {
> > +             skb = dequeue_head(flow);
> > +             pkt_len = qdisc_pkt_len(skb);
> > +             sch->qstats.backlog -= pkt_len;
> > +             sch->q.qlen--;
> > +             qdisc_bstats_update(sch, skb);
> > +     }
>
> If you factor this out into a dequeue_func(), this dequeue() function is
> very close to identical to the one in fq_codel().

Do you suggest I put the if (flow->head) block in a different function?

>
> > +     if (!skb) {
> > +             if (head == &q->new_flows && !list_empty(&q->old_flows))
> > +                     list_move_tail(&flow->flowchain, &q->old_flows);
> > +             else
> > +                     list_del_init(&flow->flowchain);
> > +             goto begin;
> > +     }
> > +
> > +     flow->qlen--;
> > +     flow->deficit -= pkt_len;
> > +     flow->backlog -= pkt_len;
> > +     pie_process_dequeue(flow->backlog, skb, &flow->vars);
> > +     return skb;
> > +}
> > +
> > +static int fq_pie_change(struct Qdisc *sch, struct nlattr *opt,
> > +                      struct netlink_ext_ack *extack)
> > +{
> > +     struct fq_pie_sched_data *q = qdisc_priv(sch);
> > +     struct nlattr *tb[TCA_FQ_PIE_MAX + 1];
> > +     unsigned int len_dropped = 0;
> > +     unsigned int num_dropped = 0;
> > +     unsigned int qlen;
> > +     int err;
> > +
> > +     if (!opt)
> > +             return -EINVAL;
> > +
> > +     err = nla_parse_nested(tb, TCA_FQ_PIE_MAX, opt, fq_pie_policy, NULL);
> > +     if (err < 0)
> > +             return err;
> > +
> > +     if (tb[TCA_FQ_PIE_FLOWS]) {
> > +             if (q->flows)
> > +                     return -EINVAL;
> > +             q->params.flows_cnt = nla_get_u32(tb[TCA_FQ_PIE_FLOWS]);
> > +             if (!q->params.flows_cnt ||
> > +                 q->params.flows_cnt > 65536)
> > +                     return -EINVAL;
> > +     }
> > +
> > +     sch_tree_lock(sch);
> > +
> > +     /* convert from microseconds to pschedtime */
> > +     if (tb[TCA_FQ_PIE_TARGET]) {
> > +             /* target is in us */
> > +             u32 target = nla_get_u32(tb[TCA_FQ_PIE_TARGET]);
> > +
> > +             /* convert to pschedtime */
> > +             q->params.p_params.target =
> > +             PSCHED_NS2TICKS((u64)target * NSEC_PER_USEC);
> > +     }
> > +
> > +     /* tupdate is in jiffies */
> > +     if (tb[TCA_FQ_PIE_TUPDATE])
> > +             q->params.p_params.tupdate =
> > +             usecs_to_jiffies(nla_get_u32(tb[TCA_FQ_PIE_TUPDATE]));
> > +
> > +     if (tb[TCA_FQ_PIE_LIMIT]) {
> > +             u32 limit = nla_get_u32(tb[TCA_FQ_PIE_LIMIT]);
> > +
> > +             q->params.p_params.limit = limit;
> > +             sch->limit = limit;
> > +     }
> > +
> > +     if (tb[TCA_FQ_PIE_ECN_PROB])
> > +             q->params.ecn_prob = nla_get_u32(tb[TCA_FQ_PIE_ECN_PROB]);
> > +
> > +     if (tb[TCA_FQ_PIE_ALPHA])
> > +             q->params.p_params.alpha = nla_get_u32(tb[TCA_FQ_PIE_ALPHA]);
> > +
> > +     if (tb[TCA_FQ_PIE_BETA])
> > +             q->params.p_params.beta = nla_get_u32(tb[TCA_FQ_PIE_BETA]);
> > +
> > +     if (tb[TCA_FQ_PIE_ECN])
> > +             q->params.p_params.ecn = nla_get_u32(tb[TCA_FQ_PIE_ECN]);
> > +
> > +     if (tb[TCA_FQ_PIE_QUANTUM])
> > +             q->quantum = nla_get_u32(tb[TCA_FQ_PIE_QUANTUM]);
> > +
> > +     if (tb[TCA_FQ_PIE_BYTEMODE])
> > +             q->params.p_params.bytemode =
> > +             nla_get_u32(tb[TCA_FQ_PIE_BYTEMODE]);
> > +
> > +     /* Drop excess packets if new limit is lower */
> > +     qlen = sch->q.qlen;
> > +     while (sch->q.qlen > sch->limit) {
> > +             struct sk_buff *skb = fq_pie_qdisc_dequeue(sch);
> > +
> > +             kfree_skb(skb);
> > +             len_dropped += qdisc_pkt_len(skb);
> > +             num_dropped += 1;
> > +     }
> > +     qdisc_tree_reduce_backlog(sch, num_dropped, len_dropped);
> > +
> > +     sch_tree_unlock(sch);
> > +     return 0;
> > +}
> > +
> > +static void fq_pie_timer(struct timer_list *t)
> > +{
> > +     struct fq_pie_sched_data *q = from_timer(q, t, adapt_timer);
> > +     struct Qdisc *sch = q->sch;
> > +     spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch));
> > +     u16 idx;
> > +
> > +     spin_lock(root_lock);
> > +
> > +     for (idx = 0; idx < q->params.flows_cnt; idx++)
> > +             calculate_probability(q->flows[idx].backlog,
> > +                                   &q->flows[idx].vars,
> > +                                   &q->params.p_params);
> > +
> > +     // reset the timer to fire after 'tupdate'. tupdate is in jiffies.
> > +     if (q->params.p_params.tupdate)
> > +             mod_timer(&q->adapt_timer,
> > +                       jiffies + q->params.p_params.tupdate);
> > +     spin_unlock(root_lock);
> > +}
> > +
> > +static int fq_pie_init(struct Qdisc *sch, struct nlattr *opt,
> > +                    struct netlink_ext_ack *extack)
> > +{
> > +     struct fq_pie_sched_data *q = qdisc_priv(sch);
> > +     int err;
> > +     int i;
> > +
> > +     fq_pie_params_init(&q->params);
> > +     sch->limit = 10 * 1024;
> > +     q->params.p_params.limit = sch->limit;
> > +     q->quantum = psched_mtu(qdisc_dev(sch));
> > +     q->sch = sch;
> > +
> > +     INIT_LIST_HEAD(&q->new_flows);
> > +     INIT_LIST_HEAD(&q->old_flows);
> > +
> > +     timer_setup(&q->adapt_timer, fq_pie_timer, 0);
> > +     mod_timer(&q->adapt_timer, jiffies + HZ / 2);
> > +
> > +     if (opt) {
> > +             int err = fq_pie_change(sch, opt, extack);
> > +
> > +             if (err)
> > +                     return err;
> > +     }
> > +
> > +     if (!q->flows) {
> > +             q->flows = kvcalloc(q->params.flows_cnt,
> > +                                 sizeof(struct fq_pie_flow),
> > +                                 GFP_KERNEL);
> > +             if (!q->flows) {
> > +                     err = -ENOMEM;
> > +                     goto init_failure;
> > +             }
> > +             for (i = 0; i < q->params.flows_cnt; i++) {
> > +                     struct fq_pie_flow *flow = q->flows + i;
> > +
> > +                     INIT_LIST_HEAD(&flow->flowchain);
> > +                     pie_vars_init(&flow->vars);
> > +             }
> > +     }
> > +     return 0;
> > +
> > +init_failure:
> > +     q->params.flows_cnt = 0;
> > +
> > +     return err;
> > +}
> > +
> > +static int fq_pie_dump(struct Qdisc *sch, struct sk_buff *skb)
> > +{
> > +     struct fq_pie_sched_data *q = qdisc_priv(sch);
> > +     struct nlattr *opts;
> > +
> > +     opts = nla_nest_start(skb, TCA_OPTIONS);
> > +     if (!opts)
> > +             goto nla_put_failure;
> > +
> > +     /* convert target from pschedtime to us */
> > +     if (nla_put_u32(skb, TCA_FQ_PIE_TARGET,
> > +                     ((u32)PSCHED_TICKS2NS(q->params.p_params.target)) /
> > +                     NSEC_PER_USEC) ||
> > +         nla_put_u32(skb, TCA_FQ_PIE_LIMIT, sch->limit) ||
> > +         nla_put_u32(skb, TCA_FQ_PIE_TUPDATE,
> > +                     jiffies_to_usecs(q->params.p_params.tupdate)) ||
> > +         nla_put_u32(skb, TCA_FQ_PIE_ALPHA, q->params.p_params.alpha) ||
> > +         nla_put_u32(skb, TCA_FQ_PIE_BETA, q->params.p_params.beta) ||
> > +         nla_put_u32(skb, TCA_FQ_PIE_ECN, q->params.p_params.ecn) ||
> > +         nla_put_u32(skb, TCA_FQ_PIE_BYTEMODE,
> > +                     q->params.p_params.bytemode) ||
> > +         nla_put_u32(skb, TCA_FQ_PIE_QUANTUM, q->quantum) ||
> > +         nla_put_u32(skb, TCA_FQ_PIE_FLOWS, q->params.flows_cnt) ||
> > +         nla_put_u32(skb, TCA_FQ_PIE_ECN_PROB, q->params.ecn_prob))
> > +             goto nla_put_failure;
> > +
> > +     return nla_nest_end(skb, opts);
> > +
> > +nla_put_failure:
> > +     return -1;
> > +}
> > +
> > +static int fq_pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
> > +{
> > +     struct fq_pie_sched_data *q = qdisc_priv(sch);
> > +     struct tc_fq_pie_xstats st = {
> > +             .packets_in     = q->stats.packets_in,
> > +             .overlimit      = q->stats.overlimit,
> > +             .dropped        = q->stats.dropped,
> > +             .ecn_mark       = q->stats.ecn_mark,
> > +             .new_flow_count = q->stats.new_flow_count,
> > +     };
> > +     struct list_head *pos;
> > +
> > +     sch_tree_lock(sch);
> > +     list_for_each(pos, &q->new_flows)
> > +             st.new_flows_len++;
> > +
> > +     list_for_each(pos, &q->old_flows)
> > +             st.old_flows_len++;
> > +     sch_tree_unlock(sch);
> > +
> > +     return gnet_stats_copy_app(d, &st, sizeof(st));
> > +}
> > +
> > +static void fq_pie_reset(struct Qdisc *sch)
> > +{
> > +     struct fq_pie_sched_data *q = qdisc_priv(sch);
> > +     u16 idx;
> > +
> > +     INIT_LIST_HEAD(&q->new_flows);
> > +     INIT_LIST_HEAD(&q->old_flows);
> > +     for (idx = 0; idx < q->params.flows_cnt; idx++) {
> > +             struct fq_pie_flow *flow = q->flows + idx;
> > +
> > +             /* Removes all packets from flow */
> > +             rtnl_kfree_skbs(flow->head, flow->tail);
> > +             flow->head = NULL;
> > +
> > +             INIT_LIST_HEAD(&flow->flowchain);
> > +             pie_vars_init(&flow->vars);
> > +     }
> > +
> > +     sch->q.qlen = 0;
> > +     sch->qstats.backlog = 0;
> > +}
> > +
> > +static void fq_pie_destroy(struct Qdisc *sch)
> > +{
> > +     struct fq_pie_sched_data *q = qdisc_priv(sch);
> > +
> > +     kvfree(q->flows);
> > +     del_timer_sync(&q->adapt_timer);
> > +}
> > +
> > +static struct Qdisc_ops fq_pie_qdisc_ops __read_mostly = {
> > +     .id = "fq_pie",
> > +     .priv_size      = sizeof(struct fq_pie_sched_data),
> > +     .enqueue        = fq_pie_qdisc_enqueue,
> > +     .dequeue        = fq_pie_qdisc_dequeue,
> > +     .peek           = qdisc_peek_dequeued,
> > +     .init           = fq_pie_init,
> > +     .destroy        = fq_pie_destroy,
> > +     .reset          = fq_pie_reset,
> > +     .change         = fq_pie_change,
> > +     .dump           = fq_pie_dump,
> > +     .dump_stats     = fq_pie_dump_stats,
> > +     .owner          = THIS_MODULE,
> > +};
> > +
> > +static int __init fq_pie_module_init(void)
> > +{
> > +     return register_qdisc(&fq_pie_qdisc_ops);
> > +}
> > +
> > +static void __exit fq_pie_module_exit(void)
> > +{
> > +     unregister_qdisc(&fq_pie_qdisc_ops);
> > +}
> > +
> > +module_init(fq_pie_module_init);
> > +module_exit(fq_pie_module_exit);
> > +
> > +MODULE_DESCRIPTION("Flow Queue Proportional Integral controller Enhanced (FQ-PIE) scheduler");
> > +MODULE_AUTHOR("Mohit P. Tahiliani");
> > +MODULE_AUTHOR("Sachin D. Patil");
> > +MODULE_AUTHOR("Mohit Bhasi");
> > +MODULE_AUTHOR("V Saicharan");
> > +MODULE_AUTHOR("Leslie Monis");
> > +MODULE_AUTHOR("Gautam Ramakrishnan");
> > +MODULE_LICENSE("GPL");
> > --
> > 2.17.1
Toke Høiland-Jørgensen April 2, 2019, 5:25 p.m. UTC | #3
Gautam Ramakrishnan <gautamramk@gmail.com> writes:

> Hello, thanks for the feedback
>
> On Tue, Apr 2, 2019 at 4:19 PM Toke Høiland-Jørgensen <toke@redhat.com> wrote:
>>
>> Some suggestions below to make fq_pie and fq_codel more similar (ref. my
>> previous email).
>>
>> Also, a few unrelated nits.
>>
>> > From: Mohit P. Tahiliani <tahiliani@nitk.edu.in>
>> >
>> > FQ-PIE incorporates fair/flow queuing in which every queue
>> > is managed by an instance of PIE queueing discipline.
>> > The algorithm provides good control over the queueing delay
>> > while at the same time, ensures fairness across various
>> > flows sharing the same link.
>> >
>> > Principles:
>> >   - Packets are classified stochastically on flows.
>> >   - Each flow has a PIE managed queue.
>> >   - Flows are linked onto two (Round Robin) lists,
>> >     so that new flows have priority on old ones.
>> >   - For a given flow, packets are dropped on arrival according
>> >     to a drop probability.
>> >   - Tail drops only.
>>
>> Why tail drops only?
>
> I had mentioned this because packets are dropped only at enqueue.

Yup, realise that; was just wondering why you went with this design
instead of doing the head drop that fq_codel does?

>>
>> > Usage:
>> > tc qdisc ... fq_pie [ limit PACKETS ] [ flows NUMBER ]
>> >                     [ alpha NUMBER ] [ beta NUMBER ]
>> >                     [ target TIME us ] [ tupdate TIME us ]
>> >                   [ bytemode ] [ quantum BYTES ]
>> >                   [ ecn | ecn_prob PERCENTAGE ]
>> >
>> > defaults: 1024 flows, 10240 packets limit, quantum : device MTU
>> >            target: 15ms
>> >            tupdate: 15ms
>> >            alpha: 2 (on a scale of 0 to 16)
>> >            beta: 20 (on a scale of 0 to 16)
>> >            ecn: false
>> >            ecn_prob: 10%
>> >
>> > Signed-off-by: Mohit P. Tahiliani <tahiliani@nitk.edu.in>
>> > Signed-off-by: Sachin D. Patil <sdp.sachin@gmail.com>
>> > Signed-off-by: Mohit Bhasi <mohitbhasi1998@gmail.com>
>> > Signed-off-by: V. Saicharan <vsaicharan1998@gmail.com>
>> > Signed-off-by: Leslie Monis <lesliemonis@gmail.com>
>> > Signed-off-by: Gautam Ramakrishnan <gautamramk@gmail.com>
>> > Cc: Dave Taht <dave.taht@gmail.com>
>> > ---
>> >  include/uapi/linux/pkt_sched.h |  28 ++
>> >  net/sched/Kconfig              |  14 +-
>> >  net/sched/Makefile             |   1 +
>> >  net/sched/sch_fq_pie.c         | 485 +++++++++++++++++++++++++++++++++
>> >  4 files changed, 527 insertions(+), 1 deletion(-)
>> >  create mode 100644 net/sched/sch_fq_pie.c
>> >
>> > diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
>> > index 7ee74c3474bf..005413bd09ee 100644
>> > --- a/include/uapi/linux/pkt_sched.h
>> > +++ b/include/uapi/linux/pkt_sched.h
>> > @@ -964,6 +964,34 @@ struct tc_pie_xstats {
>> >       __u32 ecn_mark;         /* packets marked with ecn*/
>> >  };
>> >
>> > +/* FQ PIE */
>> > +enum {
>> > +     TCA_FQ_PIE_UNSPEC,
>> > +     TCA_FQ_PIE_TARGET,
>> > +     TCA_FQ_PIE_LIMIT,
>> > +     TCA_FQ_PIE_TUPDATE,
>> > +     TCA_FQ_PIE_ALPHA,
>> > +     TCA_FQ_PIE_BETA,
>> > +     TCA_FQ_PIE_ECN,
>> > +     TCA_FQ_PIE_QUANTUM,
>> > +     TCA_FQ_PIE_BYTEMODE,
>> > +     TCA_FQ_PIE_FLOWS,
>> > +     TCA_FQ_PIE_ECN_PROB,
>> > +     __TCA_FQ_PIE_MAX
>> > +};
>> > +#define TCA_FQ_PIE_MAX   (__TCA_FQ_PIE_MAX - 1)
>> > +
>> > +struct tc_fq_pie_xstats {
>> > +     __u32 packets_in;       /* total number of packets enqueued */
>> > +     __u32 dropped;          /* packets dropped due to fq_pie_action */
>> > +     __u32 overlimit;        /* dropped due to lack of space in queue */
>> > +     __u32 ecn_mark;         /* packets marked with ecn*/
>> > +     __u32 new_flow_count;   /* number of time packets
>> > +                                created a 'new flow' */
>> > +     __u32 new_flows_len;    /* count of flows in new list */
>> > +     __u32 old_flows_len;    /* count of flows in old list */
>> > +};
>> > +
>> >  /* CBS */
>> >  struct tc_cbs_qopt {
>> >       __u8 offload;
>> > diff --git a/net/sched/Kconfig b/net/sched/Kconfig
>> > index 5c02ad97ef23..49f4dd9894a0 100644
>> > --- a/net/sched/Kconfig
>> > +++ b/net/sched/Kconfig
>> > @@ -358,13 +358,25 @@ config NET_SCH_PIE
>> >       help
>> >         Say Y here if you want to use the Proportional Integral controller
>> >         Enhanced scheduler packet scheduling algorithm.
>> > -       For more information, please see https://tools.ietf.org/html/rfc8033
>> > +       For more information, please see
>> > +       http://tools.ietf.org/html/draft-pan-tsvwg-pie-00
>> >
>> >         To compile this driver as a module, choose M here: the module
>> >         will be called sch_pie.
>> >
>> >         If unsure, say N.
>> >
>> > +config NET_SCH_FQ_PIE
>> > +     tristate "Flow Queue Proportional Integral controller Enhanced (FQ-PIE) scheduler"
>> > +     help
>> > +       Say Y here if you want to use the Flow Queue Proportional Integral controller
>> > +       Enhanced scheduler packet scheduling algorithm.
>> > +
>> > +       To compile this driver as a module, choose M here: the module
>> > +       will be called sch_fq_pie.
>> > +
>> > +       If unsure, say N.
>> > +
>> >  config NET_SCH_INGRESS
>> >       tristate "Ingress/classifier-action Qdisc"
>> >       depends on NET_CLS_ACT
>> > diff --git a/net/sched/Makefile b/net/sched/Makefile
>> > index 8a40431d7b5c..fdcd3f7b2fb2 100644
>> > --- a/net/sched/Makefile
>> > +++ b/net/sched/Makefile
>> > @@ -55,6 +55,7 @@ obj-$(CONFIG_NET_SCH_CAKE)  += sch_cake.o
>> >  obj-$(CONFIG_NET_SCH_FQ)     += sch_fq.o
>> >  obj-$(CONFIG_NET_SCH_HHF)    += sch_hhf.o
>> >  obj-$(CONFIG_NET_SCH_PIE)    += sch_pie.o
>> > +obj-$(CONFIG_NET_SCH_FQ_PIE)    += sch_fq_pie.o
>> >  obj-$(CONFIG_NET_SCH_CBS)    += sch_cbs.o
>> >  obj-$(CONFIG_NET_SCH_ETF)    += sch_etf.o
>> >  obj-$(CONFIG_NET_SCH_TAPRIO) += sch_taprio.o
>> > diff --git a/net/sched/sch_fq_pie.c b/net/sched/sch_fq_pie.c
>> > new file mode 100644
>> > index 000000000000..4ccefa0bc7f0
>> > --- /dev/null
>> > +++ b/net/sched/sch_fq_pie.c
>> > @@ -0,0 +1,485 @@
>> > +/*
>> > + * net/sched/sch_fq_pie.c
>> > + *
>> > + * This program is free software; you can redistribute it and/or
>> > + * modify it under the terms of the GNU General Public License
>> > + * as published by the Free Software Foundation; either version 2
>> > + * of the License.
>> > + *
>> > + * This program is distributed in the hope that it will be useful,
>> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
>> > + * GNU General Public License for more details.
>>
>> Lose the license boilerplate and replace it with a SPDX header line.
>
> I shall do that.
>
>>
>> > + * Author: Mohit P. Tahiliani <tahiliani@nitk.edu.in>
>> > + * Author: Sachin D. Patil <sdp.sachin@gmail.com>
>> > + * Author: Mohit Bhasi <mohitbhasi1998@gmail.com>
>> > + * Author: V Saicharan <vsaicharan1998@gmail.com>
>> > + * Author: Leslie Monis <lesliemonis@gmail.com>
>> > + * Author: Gautam Ramakrishnan <gautamramk@gmail.com>
>> > + *
>> > + * References:
>> > + * RFC 8033: https://tools.ietf.org/html/rfc8033
>> > + */
>> > +
>> > +#include <linux/jhash.h>
>> > +#include <linux/vmalloc.h>
>> > +#include <net/pie.h>
>> > +
>> > +struct fq_pie_params {
>> > +     struct pie_params p_params;
>> > +     u32 ecn_prob;
>> > +     u32 flows_cnt;
>> > +};
>> > +
>> > +struct fq_pie_stats {
>> > +     u32 packets_in;         /* total number of packets enqueued */
>> > +     u32 dropped;            /* packets dropped due to fq_pie action */
>> > +     u32 overlimit;          /* dropped due to lack of space in queue */
>> > +     u32 ecn_mark;           /* packets marked with ECN */
>> > +     u32 new_flow_count;     /* number of time packets created a new flow */
>> > +};
>> > +
>> > +struct fq_pie_flow {
>> > +     s32 deficit;            /* number of credits remaining for the flow */
>> > +     u32 backlog;            /* size of data in the flow */
>> > +     u32 qlen;               /* number of packets in the flow */
>> > +     struct sk_buff *head;
>> > +     struct sk_buff *tail;
>> > +     struct list_head flowchain;
>> > +     struct pie_vars vars;   /* pie vars for the flow */
>> > +     struct pie_stats stats; /* pie stats for the flow */
>> > +};
>> > +
>> > +struct fq_pie_sched_data {
>> > +     u32 quantum;            /* number of credits in deficit round robin */
>> > +     struct fq_pie_flow *flows;
>> > +     struct Qdisc *sch;
>> > +     struct fq_pie_params params;
>> > +     struct fq_pie_stats stats;
>> > +     struct list_head old_flows;
>> > +     struct list_head new_flows;
>> > +     struct timer_list adapt_timer;
>> > +};
>>
>> The flow and sched_data structs have quite a bit in common with those in
>> fq_codel; but the members are in a different order.
>>
>> > +static void fq_pie_params_init(struct fq_pie_params *params)
>> > +{
>> > +     pie_params_init(&params->p_params);
>> > +     params->ecn_prob = 10;
>> > +     params->flows_cnt = 1024;
>> > +}
>> > +
>> > +static inline void flow_queue_add(struct fq_pie_flow *flow,
>> > +                               struct sk_buff *skb)
>> > +{
>> > +     if (!flow->head)
>> > +             flow->head = skb;
>> > +     else
>> > +             flow->tail->next = skb;
>> > +     flow->tail = skb;
>> > +     skb->next = NULL;
>> > +}
>> > +
>> > +static int fq_pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
>> > +                             struct sk_buff **to_free)
>> > +{
>> > +     struct fq_pie_sched_data *q = qdisc_priv(sch);
>> > +     struct fq_pie_flow *sel_flow;
>> > +     u32 pkt_len;
>> > +     u32 idx;
>> > +     u8 enqueue = false;
>> > +
>> > +     /* Classifies packet into corresponding flow */
>> > +     idx = reciprocal_scale(skb_get_hash(skb), q->params.flows_cnt);
>> > +     sel_flow = &q->flows[idx];
>>
>> This is missing the ability to override the classification from tc
>> filters. See fq_codel_classify().
>
> I wanted to keep it simple initially. Shall add that.
>
>>
>> > +
>> > +     /* Checks if the qdisc is full */
>> > +     if (unlikely(qdisc_qlen(sch) >= sch->limit)) {
>> > +             q->stats.overlimit++;
>> > +             sel_flow->stats.overlimit++;
>> > +             goto out;
>> > +     }
>>
>> The memory_limit checks in fq_codel have turned out to be quite useful
>> on constrained systems. I'd suggest adding them here as well.
>
> I shall add that too.
>
>>
>> > +
>> > +     if (!drop_early(sch, sel_flow->backlog, skb->len, &sel_flow->vars,
>> > +                     &q->params.p_params)) {
>> > +             enqueue = true;
>> > +     } else if (q->params.p_params.ecn &&
>> > +                sel_flow->vars.prob <=
>> > +                (MAX_PROB / 100) * q->params.ecn_prob &&
>> > +                INET_ECN_set_ce(skb)) {
>> > +             /* If packet is ecn capable, mark it if drop probability
>> > +              * is lower than the parameter ecn_prob, else drop it.
>> > +              */
>> > +             q->stats.ecn_mark++;
>> > +             sel_flow->stats.ecn_mark++;
>> > +             enqueue = true;
>> > +     }
>> > +     if (enqueue) {
>> > +             pkt_len = qdisc_pkt_len(skb);
>> > +             q->stats.packets_in++;
>> > +             sch->qstats.backlog += pkt_len;
>> > +             sch->q.qlen++;
>> > +             flow_queue_add(sel_flow, skb);
>> > +             if (list_empty(&sel_flow->flowchain)) {
>> > +                     list_add_tail(&sel_flow->flowchain, &q->new_flows);
>> > +                     q->stats.new_flow_count++;
>> > +                     sel_flow->deficit = q->quantum;
>> > +                     sel_flow->stats.dropped = 0;
>> > +                     sel_flow->qlen = 0;
>> > +                     sel_flow->backlog = 0;
>> > +             }
>> > +             sel_flow->qlen++;
>> > +             sel_flow->stats.packets_in++;
>> > +             sel_flow->backlog += pkt_len;
>> > +             return NET_XMIT_SUCCESS;
>> > +     }
>> > +out:
>> > +     q->stats.dropped++;
>> > +     sel_flow->stats.dropped++;
>> > +     return qdisc_drop(skb, sch, to_free);
>>
>> You probably want to return NET_XMIT_CN instead of NET_XMIT_DROP here.
>>
>
> Should I replace qdisc_drop with __qdisc_drop and return NET_XMIT_CN?

Yeah, I think that would be more appropriate here, as that would
directly throttle sockets on the same host.

>> > +}
>> > +
>> > +static const struct nla_policy fq_pie_policy[TCA_FQ_PIE_MAX + 1] = {
>> > +     [TCA_FQ_PIE_TARGET] = {.type = NLA_U32},
>> > +     [TCA_FQ_PIE_LIMIT] = {.type = NLA_U32},
>> > +     [TCA_FQ_PIE_TUPDATE] = {.type = NLA_U32},
>> > +     [TCA_FQ_PIE_ALPHA] = {.type = NLA_U32},
>> > +     [TCA_FQ_PIE_BETA] = {.type = NLA_U32},
>> > +     [TCA_FQ_PIE_ECN] = {.type = NLA_U32},
>> > +     [TCA_FQ_PIE_QUANTUM] = {.type = NLA_U32},
>> > +     [TCA_FQ_PIE_BYTEMODE] = {.type = NLA_U32},
>> > +     [TCA_FQ_PIE_FLOWS] = {.type = NLA_U32},
>> > +     [TCA_FQ_PIE_ECN_PROB] = {.type = NLA_U32}
>> > +};
>> > +
>> > +static inline struct sk_buff *dequeue_head(struct fq_pie_flow *flow)
>> > +{
>> > +     struct sk_buff *skb = flow->head;
>> > +
>> > +     flow->head = skb->next;
>> > +     skb->next = NULL;
>> > +     return skb;
>> > +}
>> > +
>> > +static struct sk_buff *fq_pie_qdisc_dequeue(struct Qdisc *sch)
>> > +{
>> > +     struct fq_pie_sched_data *q = qdisc_priv(sch);
>> > +     struct sk_buff *skb = NULL;
>> > +     struct fq_pie_flow *flow;
>> > +     struct list_head *head;
>> > +     u32 pkt_len;
>> > +
>> > +begin:
>> > +     head = &q->new_flows;
>> > +     if (list_empty(head)) {
>> > +             head = &q->old_flows;
>> > +             if (list_empty(head))
>> > +                     return NULL;
>> > +     }
>> > +
>> > +     flow = list_first_entry(head, struct fq_pie_flow, flowchain);
>> > +     /* Flow has exhausted all its credits */
>> > +     if (flow->deficit <= 0) {
>> > +             flow->deficit += q->quantum;
>> > +             list_move_tail(&flow->flowchain, &q->old_flows);
>> > +             goto begin;
>> > +     }
>> > +
>> > +     if (flow->head) {
>> > +             skb = dequeue_head(flow);
>> > +             pkt_len = qdisc_pkt_len(skb);
>> > +             sch->qstats.backlog -= pkt_len;
>> > +             sch->q.qlen--;
>> > +             qdisc_bstats_update(sch, skb);
>> > +     }
>>
>> If you factor this out into a dequeue_func(), this dequeue() function is
>> very close to identical to the one in fq_codel().
>
> Do you suggest I put the if(flow->head) block in a different function?

Yeah, that was what I meant. However, without doing the full refactoring
it's only borderline useful, so feel free to just keep it the way it
is...

-Toke
Gautam Ramakrishnan April 8, 2019, 5:31 a.m. UTC | #4
I was trying to refactor the code and I ran into some issues.
1. I moved some of the parameters, such as flows_cnt, into a new struct
called fq_pie_params instead of keeping them in fq_pie_sched_data.
Should I move those parameters back into fq_pie_sched_data?
2. fq_codel maintains the backlog variable as a list in the
fq_codel_sched_data, whereas I maintain a backlog in the struct
fq_pie_flow. What approach should I follow?
3. Would maintaining a per-flow pie_stats be useful in the future? I
do not use the per-flow stats anywhere in the code.

On Tue, Apr 2, 2019 at 10:55 PM Toke Høiland-Jørgensen <toke@redhat.com> wrote:
>
> Gautam Ramakrishnan <gautamramk@gmail.com> writes:
>
> > Hello, thanks for the feedback
> >
> > On Tue, Apr 2, 2019 at 4:19 PM Toke Høiland-Jørgensen <toke@redhat.com> wrote:
> >>
> >> Some suggestions below to make fq_pie and fq_codel more similar (ref. my
> >> previous email).
> >>
> >> Also, a few unrelated nits.
> >>
> >> > From: Mohit P. Tahiliani <tahiliani@nitk.edu.in>
> >> >
> >> > FQ-PIE incorporates fair/flow queuing in which every queue
> >> > is managed by an instance of PIE queueing discipline.
> >> > The algorithm provides good control over the queueing delay
> >> > while at the same time, ensures fairness across various
> >> > flows sharing the same link.
> >> >
> >> > Principles:
> >> >   - Packets are classified stochastically on flows.
> >> >   - Each flow has a PIE managed queue.
> >> >   - Flows are linked onto two (Round Robin) lists,
> >> >     so that new flows have priority on old ones.
> >> >   - For a given flow, packets are dropped on arrival according
> >> >     to a drop probability.
> >> >   - Tail drops only.
> >>
> >> Why tail drops only?
> >
> > I had mentioned this because packets are dropped only at enqueue.
>
> Yup, realise that; was just wondering why you went with this design
> instead of doing the head drop that fq_codel does?
>
> >>
> >> > Usage:
> >> > tc qdisc ... fq_pie [ limit PACKETS ] [ flows NUMBER ]
> >> >                     [ alpha NUMBER ] [ beta NUMBER ]
> >> >                     [ target TIME us ] [ tupdate TIME us ]
> >> >                   [ bytemode ] [ quantum BYTES ]
> >> >                   [ ecn | ecn_prob PERCENTAGE ]
> >> >
> >> > defaults: 1024 flows, 10240 packets limit, quantum : device MTU
> >> >            target: 15ms
> >> >            tupdate: 15ms
> >> >            alpha: 2 (on a scale of 0 to 16)
> >> >            beta: 20 (on a scale of 0 to 16)
> >> >            ecn: false
> >> >            ecn_prob: 10%
> >> >
> >> > Signed-off-by: Mohit P. Tahiliani <tahiliani@nitk.edu.in>
> >> > Signed-off-by: Sachin D. Patil <sdp.sachin@gmail.com>
> >> > Signed-off-by: Mohit Bhasi <mohitbhasi1998@gmail.com>
> >> > Signed-off-by: V. Saicharan <vsaicharan1998@gmail.com>
> >> > Signed-off-by: Leslie Monis <lesliemonis@gmail.com>
> >> > Signed-off-by: Gautam Ramakrishnan <gautamramk@gmail.com>
> >> > Cc: Dave Taht <dave.taht@gmail.com>
> >> > ---
> >> >  include/uapi/linux/pkt_sched.h |  28 ++
> >> >  net/sched/Kconfig              |  14 +-
> >> >  net/sched/Makefile             |   1 +
> >> >  net/sched/sch_fq_pie.c         | 485 +++++++++++++++++++++++++++++++++
> >> >  4 files changed, 527 insertions(+), 1 deletion(-)
> >> >  create mode 100644 net/sched/sch_fq_pie.c
> >> >
> >> > diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
> >> > index 7ee74c3474bf..005413bd09ee 100644
> >> > --- a/include/uapi/linux/pkt_sched.h
> >> > +++ b/include/uapi/linux/pkt_sched.h
> >> > @@ -964,6 +964,34 @@ struct tc_pie_xstats {
> >> >       __u32 ecn_mark;         /* packets marked with ecn*/
> >> >  };
> >> >
> >> > +/* FQ PIE */
> >> > +enum {
> >> > +     TCA_FQ_PIE_UNSPEC,
> >> > +     TCA_FQ_PIE_TARGET,
> >> > +     TCA_FQ_PIE_LIMIT,
> >> > +     TCA_FQ_PIE_TUPDATE,
> >> > +     TCA_FQ_PIE_ALPHA,
> >> > +     TCA_FQ_PIE_BETA,
> >> > +     TCA_FQ_PIE_ECN,
> >> > +     TCA_FQ_PIE_QUANTUM,
> >> > +     TCA_FQ_PIE_BYTEMODE,
> >> > +     TCA_FQ_PIE_FLOWS,
> >> > +     TCA_FQ_PIE_ECN_PROB,
> >> > +     __TCA_FQ_PIE_MAX
> >> > +};
> >> > +#define TCA_FQ_PIE_MAX   (__TCA_FQ_PIE_MAX - 1)
> >> > +
> >> > +struct tc_fq_pie_xstats {
> >> > +     __u32 packets_in;       /* total number of packets enqueued */
> >> > +     __u32 dropped;          /* packets dropped due to fq_pie_action */
> >> > +     __u32 overlimit;        /* dropped due to lack of space in queue */
> >> > +     __u32 ecn_mark;         /* packets marked with ecn*/
> >> > +     __u32 new_flow_count;   /* number of time packets
> >> > +                                created a 'new flow' */
> >> > +     __u32 new_flows_len;    /* count of flows in new list */
> >> > +     __u32 old_flows_len;    /* count of flows in old list */
> >> > +};
> >> > +
> >> >  /* CBS */
> >> >  struct tc_cbs_qopt {
> >> >       __u8 offload;
> >> > diff --git a/net/sched/Kconfig b/net/sched/Kconfig
> >> > index 5c02ad97ef23..49f4dd9894a0 100644
> >> > --- a/net/sched/Kconfig
> >> > +++ b/net/sched/Kconfig
> >> > @@ -358,13 +358,25 @@ config NET_SCH_PIE
> >> >       help
> >> >         Say Y here if you want to use the Proportional Integral controller
> >> >         Enhanced scheduler packet scheduling algorithm.
> >> > -       For more information, please see https://tools.ietf.org/html/rfc8033
> >> > +       For more information, please see
> >> > +       http://tools.ietf.org/html/draft-pan-tsvwg-pie-00
> >> >
> >> >         To compile this driver as a module, choose M here: the module
> >> >         will be called sch_pie.
> >> >
> >> >         If unsure, say N.
> >> >
> >> > +config NET_SCH_FQ_PIE
> >> > +     tristate "Flow Queue Proportional Integral controller Enhanced (FQ-PIE) scheduler"
> >> > +     help
> >> > +       Say Y here if you want to use the Flow Queue Proportional Integral controller
> >> > +       Enhanced scheduler packet scheduling algorithm.
> >> > +
> >> > +       To compile this driver as a module, choose M here: the module
> >> > +       will be called sch_fq_pie.
> >> > +
> >> > +       If unsure, say N.
> >> > +
> >> >  config NET_SCH_INGRESS
> >> >       tristate "Ingress/classifier-action Qdisc"
> >> >       depends on NET_CLS_ACT
> >> > diff --git a/net/sched/Makefile b/net/sched/Makefile
> >> > index 8a40431d7b5c..fdcd3f7b2fb2 100644
> >> > --- a/net/sched/Makefile
> >> > +++ b/net/sched/Makefile
> >> > @@ -55,6 +55,7 @@ obj-$(CONFIG_NET_SCH_CAKE)  += sch_cake.o
> >> >  obj-$(CONFIG_NET_SCH_FQ)     += sch_fq.o
> >> >  obj-$(CONFIG_NET_SCH_HHF)    += sch_hhf.o
> >> >  obj-$(CONFIG_NET_SCH_PIE)    += sch_pie.o
> >> > +obj-$(CONFIG_NET_SCH_FQ_PIE)    += sch_fq_pie.o
> >> >  obj-$(CONFIG_NET_SCH_CBS)    += sch_cbs.o
> >> >  obj-$(CONFIG_NET_SCH_ETF)    += sch_etf.o
> >> >  obj-$(CONFIG_NET_SCH_TAPRIO) += sch_taprio.o
> >> > diff --git a/net/sched/sch_fq_pie.c b/net/sched/sch_fq_pie.c
> >> > new file mode 100644
> >> > index 000000000000..4ccefa0bc7f0
> >> > --- /dev/null
> >> > +++ b/net/sched/sch_fq_pie.c
> >> > @@ -0,0 +1,485 @@
> >> > +/*
> >> > + * net/sched/sch_fq_pie.c
> >> > + *
> >> > + * This program is free software; you can redistribute it and/or
> >> > + * modify it under the terms of the GNU General Public License
> >> > + * as published by the Free Software Foundation; either version 2
> >> > + * of the License.
> >> > + *
> >> > + * This program is distributed in the hope that it will be useful,
> >> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> >> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> >> > + * GNU General Public License for more details.
> >>
> >> Lose the license boilerplate and replace it with a SPDX header line.
> >
> > I shall do that.
> >
> >>
> >> > + * Author: Mohit P. Tahiliani <tahiliani@nitk.edu.in>
> >> > + * Author: Sachin D. Patil <sdp.sachin@gmail.com>
> >> > + * Author: Mohit Bhasi <mohitbhasi1998@gmail.com>
> >> > + * Author: V Saicharan <vsaicharan1998@gmail.com>
> >> > + * Author: Leslie Monis <lesliemonis@gmail.com>
> >> > + * Author: Gautam Ramakrishnan <gautamramk@gmail.com>
> >> > + *
> >> > + * References:
> >> > + * RFC 8033: https://tools.ietf.org/html/rfc8033
> >> > + */
> >> > +
> >> > +#include <linux/jhash.h>
> >> > +#include <linux/vmalloc.h>
> >> > +#include <net/pie.h>
> >> > +
> >> > +struct fq_pie_params {
> >> > +     struct pie_params p_params;
> >> > +     u32 ecn_prob;
> >> > +     u32 flows_cnt;
> >> > +};
> >> > +
> >> > +struct fq_pie_stats {
> >> > +     u32 packets_in;         /* total number of packets enqueued */
> >> > +     u32 dropped;            /* packets dropped due to fq_pie action */
> >> > +     u32 overlimit;          /* dropped due to lack of space in queue */
> >> > +     u32 ecn_mark;           /* packets marked with ECN */
> >> > +     u32 new_flow_count;     /* number of time packets created a new flow */
> >> > +};
> >> > +
> >> > +struct fq_pie_flow {
> >> > +     s32 deficit;            /* number of credits remaining for the flow */
> >> > +     u32 backlog;            /* size of data in the flow */
> >> > +     u32 qlen;               /* number of packets in the flow */
> >> > +     struct sk_buff *head;
> >> > +     struct sk_buff *tail;
> >> > +     struct list_head flowchain;
> >> > +     struct pie_vars vars;   /* pie vars for the flow */
> >> > +     struct pie_stats stats; /* pie stats for the flow */
> >> > +};
> >> > +
> >> > +struct fq_pie_sched_data {
> >> > +     u32 quantum;            /* number of credits in deficit round robin */
> >> > +     struct fq_pie_flow *flows;
> >> > +     struct Qdisc *sch;
> >> > +     struct fq_pie_params params;
> >> > +     struct fq_pie_stats stats;
> >> > +     struct list_head old_flows;
> >> > +     struct list_head new_flows;
> >> > +     struct timer_list adapt_timer;
> >> > +};
> >>
> >> The flow and sched_data structs have quite a bit in common with those in
> >> fq_codel; but the members are in a different order.
> >>
> >> > +static void fq_pie_params_init(struct fq_pie_params *params)
> >> > +{
> >> > +     pie_params_init(&params->p_params);
> >> > +     params->ecn_prob = 10;
> >> > +     params->flows_cnt = 1024;
> >> > +}
> >> > +
> >> > +static inline void flow_queue_add(struct fq_pie_flow *flow,
> >> > +                               struct sk_buff *skb)
> >> > +{
> >> > +     if (!flow->head)
> >> > +             flow->head = skb;
> >> > +     else
> >> > +             flow->tail->next = skb;
> >> > +     flow->tail = skb;
> >> > +     skb->next = NULL;
> >> > +}
> >> > +
> >> > +static int fq_pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
> >> > +                             struct sk_buff **to_free)
> >> > +{
> >> > +     struct fq_pie_sched_data *q = qdisc_priv(sch);
> >> > +     struct fq_pie_flow *sel_flow;
> >> > +     u32 pkt_len;
> >> > +     u32 idx;
> >> > +     u8 enqueue = false;
> >> > +
> >> > +     /* Classifies packet into corresponding flow */
> >> > +     idx = reciprocal_scale(skb_get_hash(skb), q->params.flows_cnt);
> >> > +     sel_flow = &q->flows[idx];
> >>
> >> This is missing the ability to override the classification from tc
> >> filters. See fq_codel_classify().
> >
> > I wanted to keep it simple initially. Shall add that.
> >
> >>
> >> > +
> >> > +     /* Checks if the qdisc is full */
> >> > +     if (unlikely(qdisc_qlen(sch) >= sch->limit)) {
> >> > +             q->stats.overlimit++;
> >> > +             sel_flow->stats.overlimit++;
> >> > +             goto out;
> >> > +     }
> >>
> >> The memory_limit checks in fq_codel have turned out to be quite useful
> >> on constrained systems. I'd suggest adding them here as well.
> >
> > I shall add that too.
> >
> >>
> >> > +
> >> > +     if (!drop_early(sch, sel_flow->backlog, skb->len, &sel_flow->vars,
> >> > +                     &q->params.p_params)) {
> >> > +             enqueue = true;
> >> > +     } else if (q->params.p_params.ecn &&
> >> > +                sel_flow->vars.prob <=
> >> > +                (MAX_PROB / 100) * q->params.ecn_prob &&
> >> > +                INET_ECN_set_ce(skb)) {
> >> > +             /* If packet is ecn capable, mark it if drop probability
> >> > +              * is lower than the parameter ecn_prob, else drop it.
> >> > +              */
> >> > +             q->stats.ecn_mark++;
> >> > +             sel_flow->stats.ecn_mark++;
> >> > +             enqueue = true;
> >> > +     }
> >> > +     if (enqueue) {
> >> > +             pkt_len = qdisc_pkt_len(skb);
> >> > +             q->stats.packets_in++;
> >> > +             sch->qstats.backlog += pkt_len;
> >> > +             sch->q.qlen++;
> >> > +             flow_queue_add(sel_flow, skb);
> >> > +             if (list_empty(&sel_flow->flowchain)) {
> >> > +                     list_add_tail(&sel_flow->flowchain, &q->new_flows);
> >> > +                     q->stats.new_flow_count++;
> >> > +                     sel_flow->deficit = q->quantum;
> >> > +                     sel_flow->stats.dropped = 0;
> >> > +                     sel_flow->qlen = 0;
> >> > +                     sel_flow->backlog = 0;
> >> > +             }
> >> > +             sel_flow->qlen++;
> >> > +             sel_flow->stats.packets_in++;
> >> > +             sel_flow->backlog += pkt_len;
> >> > +             return NET_XMIT_SUCCESS;
> >> > +     }
> >> > +out:
> >> > +     q->stats.dropped++;
> >> > +     sel_flow->stats.dropped++;
> >> > +     return qdisc_drop(skb, sch, to_free);
> >>
> >> You probably want to return NET_XMIT_CN instead of NET_XMIT_DROP here.
> >>
> >
> > Should I replace qdisc_drop with __qdisc_drop and return NET_XMIT_CN?
>
> Yeah, I think that would be more appropriate here, as that would
> directly throttle sockets on the same host.
>
> >> > +}
> >> > +
> >> > +static const struct nla_policy fq_pie_policy[TCA_FQ_PIE_MAX + 1] = {
> >> > +     [TCA_FQ_PIE_TARGET] = {.type = NLA_U32},
> >> > +     [TCA_FQ_PIE_LIMIT] = {.type = NLA_U32},
> >> > +     [TCA_FQ_PIE_TUPDATE] = {.type = NLA_U32},
> >> > +     [TCA_FQ_PIE_ALPHA] = {.type = NLA_U32},
> >> > +     [TCA_FQ_PIE_BETA] = {.type = NLA_U32},
> >> > +     [TCA_FQ_PIE_ECN] = {.type = NLA_U32},
> >> > +     [TCA_FQ_PIE_QUANTUM] = {.type = NLA_U32},
> >> > +     [TCA_FQ_PIE_BYTEMODE] = {.type = NLA_U32},
> >> > +     [TCA_FQ_PIE_FLOWS] = {.type = NLA_U32},
> >> > +     [TCA_FQ_PIE_ECN_PROB] = {.type = NLA_U32}
> >> > +};
> >> > +
> >> > +static inline struct sk_buff *dequeue_head(struct fq_pie_flow *flow)
> >> > +{
> >> > +     struct sk_buff *skb = flow->head;
> >> > +
> >> > +     flow->head = skb->next;
> >> > +     skb->next = NULL;
> >> > +     return skb;
> >> > +}
> >> > +
> >> > +static struct sk_buff *fq_pie_qdisc_dequeue(struct Qdisc *sch)
> >> > +{
> >> > +     struct fq_pie_sched_data *q = qdisc_priv(sch);
> >> > +     struct sk_buff *skb = NULL;
> >> > +     struct fq_pie_flow *flow;
> >> > +     struct list_head *head;
> >> > +     u32 pkt_len;
> >> > +
> >> > +begin:
> >> > +     head = &q->new_flows;
> >> > +     if (list_empty(head)) {
> >> > +             head = &q->old_flows;
> >> > +             if (list_empty(head))
> >> > +                     return NULL;
> >> > +     }
> >> > +
> >> > +     flow = list_first_entry(head, struct fq_pie_flow, flowchain);
> >> > +     /* Flow has exhausted all its credits */
> >> > +     if (flow->deficit <= 0) {
> >> > +             flow->deficit += q->quantum;
> >> > +             list_move_tail(&flow->flowchain, &q->old_flows);
> >> > +             goto begin;
> >> > +     }
> >> > +
> >> > +     if (flow->head) {
> >> > +             skb = dequeue_head(flow);
> >> > +             pkt_len = qdisc_pkt_len(skb);
> >> > +             sch->qstats.backlog -= pkt_len;
> >> > +             sch->q.qlen--;
> >> > +             qdisc_bstats_update(sch, skb);
> >> > +     }
> >>
> >> If you factor this out into a dequeue_func(), this dequeue() function is
> >> very close to identical to the one in fq_codel().
> >
> > Do you suggest I put the if(flow->head) block in a different function?
>
> Yeah, that was what I meant. However, without doing the full refactoring
> it's only borderline useful, so feel free to just keep it the way it
> is...
>
> -Toke
Dave Taht April 8, 2019, 5:37 a.m. UTC | #5
On Mon, Apr 8, 2019 at 7:31 AM Gautam Ramakrishnan <gautamramk@gmail.com> wrote:
>
> I was trying to refactor the code and I ran into some issues.
> 1. I moved some of the parameters such as flows_cnt into a new struct
> called fq_pie_params, instead of keeping them in fq_pie_sched_data.
> Should I move those parameters back into fq_pie_sched_data?
> 2. fq_codel maintains the backlog variable as a list in the
> fq_codel_sched_data, whereas I maintain a backlog in the struct
> fq_pie_flow. What approach should I follow?

Hmm. I had made some efforts to speed up fq_codel here:
https://github.com/dtaht/fq_codel_fast

based on ideas here:
https://lists.bufferbloat.net/pipermail/codel/2018-August/002367.html

Notably, I wanted to get rid of the non-O(1) bulk dropper search and
keep the flow backlog closer to the flow stats.
It needs profiling (e.g. with oprofile) to see whether that helped.

> 3. Would maintaining a per flow pie_stats be useful in the future? I
> do not use the per flow stats anywhere in the code.

In general, I have not found per-flow stats useful or accessible, and it
was one of the things I eliminated in fq_codel_fast.
> On Tue, Apr 2, 2019 at 10:55 PM Toke Høiland-Jørgensen <toke@redhat.com> wrote:
> >
> > Gautam Ramakrishnan <gautamramk@gmail.com> writes:
> >
> > > Hello, thanks for the feedback
> > >
> > > On Tue, Apr 2, 2019 at 4:19 PM Toke Høiland-Jørgensen <toke@redhat.com> wrote:
> > >>
> > >> Some suggestions below to make fq_pie and fq_codel more similar (ref. my
> > >> previous email).
> > >>
> > >> Also, a few unrelated nits.
> > >>
> > >> > From: Mohit P. Tahiliani <tahiliani@nitk.edu.in>
> > >> >
> > >> > FQ-PIE incorporates fair/flow queuing in which every queue
> > >> > is managed by an instance of PIE queueing discipline.
> > >> > The algorithm provides good control over the queueing delay
> > >> > while at the same time, ensures fairness across various
> > >> > flows sharing the same link.
> > >> >
> > >> > Principles:
> > >> >   - Packets are classified stochastically on flows.
> > >> >   - Each flow has a PIE managed queue.
> > >> >   - Flows are linked onto two (Round Robin) lists,
> > >> >     so that new flows have priority on old ones.
> > >> >   - For a given flow, packets are dropped on arrival according
> > >> >     to a drop probability.
> > >> >   - Tail drops only.
> > >>
> > >> Why tail drops only?
> > >
> > > I had mentioned this because packets are dropped only at enqueue.
> >
> > Yup, realise that; was just wondering why you went with this design
> > instead of doing the head drop that fq_codel does?
> >
> > >>
> > >> > Usage:
> > >> > tc qdisc ... fq_pie [ limit PACKETS ] [ flows NUMBER ]
> > >> >                     [ alpha NUMBER ] [ beta NUMBER ]
> > >> >                     [ target TIME us ] [ tupdate TIME us ]
> > >> >                   [ bytemode ] [ quantum BYTES ]
> > >> >                   [ ecn | ecn_prob PERCENTAGE ]
> > >> >
> > >> > defaults: 1024 flows, 10240 packets limit, quantum : device MTU
> > >> >            target: 15ms
> > >> >            tupdate: 15ms
> > >> >            alpha: 2 (on a scale of 0 to 16)
> > >> >            beta: 20 (on a scale of 0 to 16)
> > >> >            ecn: false
> > >> >            ecn_prob: 10%
> > >> >
> > >> > Signed-off-by: Mohit P. Tahiliani <tahiliani@nitk.edu.in>
> > >> > Signed-off-by: Sachin D. Patil <sdp.sachin@gmail.com>
> > >> > Signed-off-by: Mohit Bhasi <mohitbhasi1998@gmail.com>
> > >> > Signed-off-by: V. Saicharan <vsaicharan1998@gmail.com>
> > >> > Signed-off-by: Leslie Monis <lesliemonis@gmail.com>
> > >> > Signed-off-by: Gautam Ramakrishnan <gautamramk@gmail.com>
> > >> > Cc: Dave Taht <dave.taht@gmail.com>
> > >> > ---
> > >> >  include/uapi/linux/pkt_sched.h |  28 ++
> > >> >  net/sched/Kconfig              |  14 +-
> > >> >  net/sched/Makefile             |   1 +
> > >> >  net/sched/sch_fq_pie.c         | 485 +++++++++++++++++++++++++++++++++
> > >> >  4 files changed, 527 insertions(+), 1 deletion(-)
> > >> >  create mode 100644 net/sched/sch_fq_pie.c
> > >> >
> > >> > diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
> > >> > index 7ee74c3474bf..005413bd09ee 100644
> > >> > --- a/include/uapi/linux/pkt_sched.h
> > >> > +++ b/include/uapi/linux/pkt_sched.h
> > >> > @@ -964,6 +964,34 @@ struct tc_pie_xstats {
> > >> >       __u32 ecn_mark;         /* packets marked with ecn*/
> > >> >  };
> > >> >
> > >> > +/* FQ PIE */
> > >> > +enum {
> > >> > +     TCA_FQ_PIE_UNSPEC,
> > >> > +     TCA_FQ_PIE_TARGET,
> > >> > +     TCA_FQ_PIE_LIMIT,
> > >> > +     TCA_FQ_PIE_TUPDATE,
> > >> > +     TCA_FQ_PIE_ALPHA,
> > >> > +     TCA_FQ_PIE_BETA,
> > >> > +     TCA_FQ_PIE_ECN,
> > >> > +     TCA_FQ_PIE_QUANTUM,
> > >> > +     TCA_FQ_PIE_BYTEMODE,
> > >> > +     TCA_FQ_PIE_FLOWS,
> > >> > +     TCA_FQ_PIE_ECN_PROB,
> > >> > +     __TCA_FQ_PIE_MAX
> > >> > +};
> > >> > +#define TCA_FQ_PIE_MAX   (__TCA_FQ_PIE_MAX - 1)
> > >> > +
> > >> > +struct tc_fq_pie_xstats {
> > >> > +     __u32 packets_in;       /* total number of packets enqueued */
> > >> > +     __u32 dropped;          /* packets dropped due to fq_pie_action */
> > >> > +     __u32 overlimit;        /* dropped due to lack of space in queue */
> > >> > +     __u32 ecn_mark;         /* packets marked with ecn*/
> > >> > +     __u32 new_flow_count;   /* number of time packets
> > >> > +                                created a 'new flow' */
> > >> > +     __u32 new_flows_len;    /* count of flows in new list */
> > >> > +     __u32 old_flows_len;    /* count of flows in old list */
> > >> > +};
> > >> > +
> > >> >  /* CBS */
> > >> >  struct tc_cbs_qopt {
> > >> >       __u8 offload;
> > >> > diff --git a/net/sched/Kconfig b/net/sched/Kconfig
> > >> > index 5c02ad97ef23..49f4dd9894a0 100644
> > >> > --- a/net/sched/Kconfig
> > >> > +++ b/net/sched/Kconfig
> > >> > @@ -358,13 +358,25 @@ config NET_SCH_PIE
> > >> >       help
> > >> >         Say Y here if you want to use the Proportional Integral controller
> > >> >         Enhanced scheduler packet scheduling algorithm.
> > >> > -       For more information, please see https://tools.ietf.org/html/rfc8033
> > >> > +       For more information, please see
> > >> > +       http://tools.ietf.org/html/draft-pan-tsvwg-pie-00
> > >> >
> > >> >         To compile this driver as a module, choose M here: the module
> > >> >         will be called sch_pie.
> > >> >
> > >> >         If unsure, say N.
> > >> >
> > >> > +config NET_SCH_FQ_PIE
> > >> > +     tristate "Flow Queue Proportional Integral controller Enhanced (FQ-PIE) scheduler"
> > >> > +     help
> > >> > +       Say Y here if you want to use the Flow Queue Proportional Integral controller
> > >> > +       Enhanced scheduler packet scheduling algorithm.
> > >> > +
> > >> > +       To compile this driver as a module, choose M here: the module
> > >> > +       will be called sch_fq_pie.
> > >> > +
> > >> > +       If unsure, say N.
> > >> > +
> > >> >  config NET_SCH_INGRESS
> > >> >       tristate "Ingress/classifier-action Qdisc"
> > >> >       depends on NET_CLS_ACT
> > >> > diff --git a/net/sched/Makefile b/net/sched/Makefile
> > >> > index 8a40431d7b5c..fdcd3f7b2fb2 100644
> > >> > --- a/net/sched/Makefile
> > >> > +++ b/net/sched/Makefile
> > >> > @@ -55,6 +55,7 @@ obj-$(CONFIG_NET_SCH_CAKE)  += sch_cake.o
> > >> >  obj-$(CONFIG_NET_SCH_FQ)     += sch_fq.o
> > >> >  obj-$(CONFIG_NET_SCH_HHF)    += sch_hhf.o
> > >> >  obj-$(CONFIG_NET_SCH_PIE)    += sch_pie.o
> > >> > +obj-$(CONFIG_NET_SCH_FQ_PIE)    += sch_fq_pie.o
> > >> >  obj-$(CONFIG_NET_SCH_CBS)    += sch_cbs.o
> > >> >  obj-$(CONFIG_NET_SCH_ETF)    += sch_etf.o
> > >> >  obj-$(CONFIG_NET_SCH_TAPRIO) += sch_taprio.o
> > >> > diff --git a/net/sched/sch_fq_pie.c b/net/sched/sch_fq_pie.c
> > >> > new file mode 100644
> > >> > index 000000000000..4ccefa0bc7f0
> > >> > --- /dev/null
> > >> > +++ b/net/sched/sch_fq_pie.c
> > >> > @@ -0,0 +1,485 @@
> > >> > +/*
> > >> > + * net/sched/sch_fq_pie.c
> > >> > + *
> > >> > + * This program is free software; you can redistribute it and/or
> > >> > + * modify it under the terms of the GNU General Public License
> > >> > + * as published by the Free Software Foundation; either version 2
> > >> > + * of the License.
> > >> > + *
> > >> > + * This program is distributed in the hope that it will be useful,
> > >> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > >> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> > >> > + * GNU General Public License for more details.
> > >>
> > >> Lose the license boilerplate and replace it with a SPDX header line.
> > >
> > > I shall do that.
> > >
> > >>
> > >> > + * Author: Mohit P. Tahiliani <tahiliani@nitk.edu.in>
> > >> > + * Author: Sachin D. Patil <sdp.sachin@gmail.com>
> > >> > + * Author: Mohit Bhasi <mohitbhasi1998@gmail.com>
> > >> > + * Author: V Saicharan <vsaicharan1998@gmail.com>
> > >> > + * Author: Leslie Monis <lesliemonis@gmail.com>
> > >> > + * Author: Gautam Ramakrishnan <gautamramk@gmail.com>
> > >> > + *
> > >> > + * References:
> > >> > + * RFC 8033: https://tools.ietf.org/html/rfc8033
> > >> > + */
> > >> > +
> > >> > +#include <linux/jhash.h>
> > >> > +#include <linux/vmalloc.h>
> > >> > +#include <net/pie.h>
> > >> > +
> > >> > +struct fq_pie_params {
> > >> > +     struct pie_params p_params;
> > >> > +     u32 ecn_prob;
> > >> > +     u32 flows_cnt;
> > >> > +};
> > >> > +
> > >> > +struct fq_pie_stats {
> > >> > +     u32 packets_in;         /* total number of packets enqueued */
> > >> > +     u32 dropped;            /* packets dropped due to fq_pie action */
> > >> > +     u32 overlimit;          /* dropped due to lack of space in queue */
> > >> > +     u32 ecn_mark;           /* packets marked with ECN */
> > >> > +     u32 new_flow_count;     /* number of time packets created a new flow */
> > >> > +};
> > >> > +
> > >> > +struct fq_pie_flow {
> > >> > +     s32 deficit;            /* number of credits remaining for the flow */
> > >> > +     u32 backlog;            /* size of data in the flow */
> > >> > +     u32 qlen;               /* number of packets in the flow */
> > >> > +     struct sk_buff *head;
> > >> > +     struct sk_buff *tail;
> > >> > +     struct list_head flowchain;
> > >> > +     struct pie_vars vars;   /* pie vars for the flow */
> > >> > +     struct pie_stats stats; /* pie stats for the flow */
> > >> > +};
> > >> > +
> > >> > +struct fq_pie_sched_data {
> > >> > +     u32 quantum;            /* number of credits in deficit round robin */
> > >> > +     struct fq_pie_flow *flows;
> > >> > +     struct Qdisc *sch;
> > >> > +     struct fq_pie_params params;
> > >> > +     struct fq_pie_stats stats;
> > >> > +     struct list_head old_flows;
> > >> > +     struct list_head new_flows;
> > >> > +     struct timer_list adapt_timer;
> > >> > +};
> > >>
> > >> The flow and sched_data structs have quite a bit in common with those in
> > >> fq_codel; but the members are in a different order.
> > >>
> > >> > +static void fq_pie_params_init(struct fq_pie_params *params)
> > >> > +{
> > >> > +     pie_params_init(&params->p_params);
> > >> > +     params->ecn_prob = 10;
> > >> > +     params->flows_cnt = 1024;
> > >> > +}
> > >> > +
> > >> > +static inline void flow_queue_add(struct fq_pie_flow *flow,
> > >> > +                               struct sk_buff *skb)
> > >> > +{
> > >> > +     if (!flow->head)
> > >> > +             flow->head = skb;
> > >> > +     else
> > >> > +             flow->tail->next = skb;
> > >> > +     flow->tail = skb;
> > >> > +     skb->next = NULL;
> > >> > +}
> > >> > +
> > >> > +static int fq_pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
> > >> > +                             struct sk_buff **to_free)
> > >> > +{
> > >> > +     struct fq_pie_sched_data *q = qdisc_priv(sch);
> > >> > +     struct fq_pie_flow *sel_flow;
> > >> > +     u32 pkt_len;
> > >> > +     u32 idx;
> > >> > +     u8 enqueue = false;
> > >> > +
> > >> > +     /* Classifies packet into corresponding flow */
> > >> > +     idx = reciprocal_scale(skb_get_hash(skb), q->params.flows_cnt);
> > >> > +     sel_flow = &q->flows[idx];
> > >>
> > >> This is missing the ability to override the classification from tc
> > >> filters. See fq_codel_classify().
> > >
> > > I wanted to keep it simple initially. Shall add that.
> > >
> > >>
> > >> > +
> > >> > +     /* Checks if the qdisc is full */
> > >> > +     if (unlikely(qdisc_qlen(sch) >= sch->limit)) {
> > >> > +             q->stats.overlimit++;
> > >> > +             sel_flow->stats.overlimit++;
> > >> > +             goto out;
> > >> > +     }
> > >>
> > >> The memory_limit checks in fq_codel have turned out to be quite useful
> > >> on constrained systems. I'd suggest adding them here as well.
> > >
> > > I shall add that too.
> > >
> > >>
> > >> > +
> > >> > +     if (!drop_early(sch, sel_flow->backlog, skb->len, &sel_flow->vars,
> > >> > +                     &q->params.p_params)) {
> > >> > +             enqueue = true;
> > >> > +     } else if (q->params.p_params.ecn &&
> > >> > +                sel_flow->vars.prob <=
> > >> > +                (MAX_PROB / 100) * q->params.ecn_prob &&
> > >> > +                INET_ECN_set_ce(skb)) {
> > >> > +             /* If packet is ecn capable, mark it if drop probability
> > >> > +              * is lower than the parameter ecn_prob, else drop it.
> > >> > +              */
> > >> > +             q->stats.ecn_mark++;
> > >> > +             sel_flow->stats.ecn_mark++;
> > >> > +             enqueue = true;
> > >> > +     }
> > >> > +     if (enqueue) {
> > >> > +             pkt_len = qdisc_pkt_len(skb);
> > >> > +             q->stats.packets_in++;
> > >> > +             sch->qstats.backlog += pkt_len;
> > >> > +             sch->q.qlen++;
> > >> > +             flow_queue_add(sel_flow, skb);
> > >> > +             if (list_empty(&sel_flow->flowchain)) {
> > >> > +                     list_add_tail(&sel_flow->flowchain, &q->new_flows);
> > >> > +                     q->stats.new_flow_count++;
> > >> > +                     sel_flow->deficit = q->quantum;
> > >> > +                     sel_flow->stats.dropped = 0;
> > >> > +                     sel_flow->qlen = 0;
> > >> > +                     sel_flow->backlog = 0;
> > >> > +             }
> > >> > +             sel_flow->qlen++;
> > >> > +             sel_flow->stats.packets_in++;
> > >> > +             sel_flow->backlog += pkt_len;
> > >> > +             return NET_XMIT_SUCCESS;
> > >> > +     }
> > >> > +out:
> > >> > +     q->stats.dropped++;
> > >> > +     sel_flow->stats.dropped++;
> > >> > +     return qdisc_drop(skb, sch, to_free);
> > >>
> > >> You probably want to return NET_XMIT_CN instead of NET_XMIT_DROP here.
> > >>
> > >
> > > Should I replace qdisc_drop with __qdisc_drop and return NET_XMIT_CN?
> >
> > Yeah, I think that would be more appropriate here, as that would
> > directly throttle sockets on the same host.
> >
> > >> > +}
> > >> > +
> > >> > +static const struct nla_policy fq_pie_policy[TCA_FQ_PIE_MAX + 1] = {
> > >> > +     [TCA_FQ_PIE_TARGET] = {.type = NLA_U32},
> > >> > +     [TCA_FQ_PIE_LIMIT] = {.type = NLA_U32},
> > >> > +     [TCA_FQ_PIE_TUPDATE] = {.type = NLA_U32},
> > >> > +     [TCA_FQ_PIE_ALPHA] = {.type = NLA_U32},
> > >> > +     [TCA_FQ_PIE_BETA] = {.type = NLA_U32},
> > >> > +     [TCA_FQ_PIE_ECN] = {.type = NLA_U32},
> > >> > +     [TCA_FQ_PIE_QUANTUM] = {.type = NLA_U32},
> > >> > +     [TCA_FQ_PIE_BYTEMODE] = {.type = NLA_U32},
> > >> > +     [TCA_FQ_PIE_FLOWS] = {.type = NLA_U32},
> > >> > +     [TCA_FQ_PIE_ECN_PROB] = {.type = NLA_U32}
> > >> > +};
> > >> > +
> > >> > +static inline struct sk_buff *dequeue_head(struct fq_pie_flow *flow)
> > >> > +{
> > >> > +     struct sk_buff *skb = flow->head;
> > >> > +
> > >> > +     flow->head = skb->next;
> > >> > +     skb->next = NULL;
> > >> > +     return skb;
> > >> > +}
> > >> > +
> > >> > +static struct sk_buff *fq_pie_qdisc_dequeue(struct Qdisc *sch)
> > >> > +{
> > >> > +     struct fq_pie_sched_data *q = qdisc_priv(sch);
> > >> > +     struct sk_buff *skb = NULL;
> > >> > +     struct fq_pie_flow *flow;
> > >> > +     struct list_head *head;
> > >> > +     u32 pkt_len;
> > >> > +
> > >> > +begin:
> > >> > +     head = &q->new_flows;
> > >> > +     if (list_empty(head)) {
> > >> > +             head = &q->old_flows;
> > >> > +             if (list_empty(head))
> > >> > +                     return NULL;
> > >> > +     }
> > >> > +
> > >> > +     flow = list_first_entry(head, struct fq_pie_flow, flowchain);
> > >> > +     /* Flow has exhausted all its credits */
> > >> > +     if (flow->deficit <= 0) {
> > >> > +             flow->deficit += q->quantum;
> > >> > +             list_move_tail(&flow->flowchain, &q->old_flows);
> > >> > +             goto begin;
> > >> > +     }
> > >> > +
> > >> > +     if (flow->head) {
> > >> > +             skb = dequeue_head(flow);
> > >> > +             pkt_len = qdisc_pkt_len(skb);
> > >> > +             sch->qstats.backlog -= pkt_len;
> > >> > +             sch->q.qlen--;
> > >> > +             qdisc_bstats_update(sch, skb);
> > >> > +     }
> > >>
> > >> If you factor this out into a dequeue_func(), this dequeue() function is
> > >> very close to identical to the one in fq_codel().
> > >
> > > Do you suggest I put the if(flow->head) block in a different function?
> >
> > Yeah, that was what I meant. However, without doing the full refactoring
> > it's only borderline useful, so feel free to just keep it the way it
> > is...
> >
> > -Toke
>
>
>
> --
> -------------
> Gautam |
Dave Taht April 8, 2019, 5:52 a.m. UTC | #6
On Mon, Apr 8, 2019 at 7:37 AM Dave Taht <dave.taht@gmail.com> wrote:
>
> On Mon, Apr 8, 2019 at 7:31 AM Gautam Ramakrishnan <gautamramk@gmail.com> wrote:
> >
> > I was trying to refactor the code and I ran into some issues.
> > 1. I moved some of the parameters such as flows_cnt into a new struct
> > called fq_pie_params, instead of keeping them in fq_pie_sched_data.
> > Should I move those parameters back into fq_pie_sched_data?
> > 2. fq_codel maintains the backlog variable as a list in the
> > fq_codel_sched_data, whereas I maintain a backlog in the struct
> > fq_pie_flow. What approach should I follow?
>
> Hmm. I had made some efforts to speed up fq_codel here:
> https://github.com/dtaht/fq_codel_fast
>
> based on ideas here:
> https://lists.bufferbloat.net/pipermail/codel/2018-August/002367.html
>
> Notably I wanted to get rid of the non O(1) bulk dropper search and
> keep the flow backlog closer to the flow stats.
> Needed oprofiling to see if it helped.

I'm sorry to conflate your fq_pie upstreaming attempt with my own
long-standing out-of-tree improvements for fq_codel. I'd been meaning to
upstream multiple fixes from there for ages, but needed to profile them
on MIPS first.

Are you in a hurry? Because: hacking in SCE into pie and fq_pie, etc,
is WAY more interesting than mainlining stuff right now. :)

Our updated "some congestion experienced"  internet draft with
suggestions for pie and red is here:

https://github.com/dtaht/bufferbloat-rfcs/blob/master/sce/draft-morton-taht-tsvwg-sce.txt#L272

and running code for sce is in that fq_codel_fast repo and in the out
of tree sch_cake also:

https://github.com/dtaht/sch_cake/commit/755ba4cda56bec45d20f7928f1442fdf83c12573

> > 3. Would maintaining a per flow pie_stats be useful in the future? I
> > do not use the per flow stats anywhere in the code.
>
> In general I have not found per flow stats useful or accessible and it
> was one of the things I eliminated in fq_codel_fast
> > On Tue, Apr 2, 2019 at 10:55 PM Toke Høiland-Jørgensen <toke@redhat.com> wrote:
> > >
> > > Gautam Ramakrishnan <gautamramk@gmail.com> writes:
> > >
> > > > Hello, thanks for the feedback
> > > >
> > > > On Tue, Apr 2, 2019 at 4:19 PM Toke Høiland-Jørgensen <toke@redhat.com> wrote:
> > > >>
> > > >> Some suggestions below to make fq_pie and fq_codel more similar (ref. my
> > > >> previous email).
> > > >>
> > > >> Also, a few unrelated nits.
> > > >>
> > > >> > From: Mohit P. Tahiliani <tahiliani@nitk.edu.in>
> > > >> >
> > > >> > FQ-PIE incorporates fair/flow queuing in which every queue
> > > >> > is managed by an instance of PIE queueing discipline.
> > > >> > The algorithm provides good control over the queueing delay
> > > >> > while at the same time, ensures fairness across various
> > > >> > flows sharing the same link.
> > > >> >
> > > >> > Principles:
> > > >> >   - Packets are classified stochastically on flows.
> > > >> >   - Each flow has a PIE managed queue.
> > > >> >   - Flows are linked onto two (Round Robin) lists,
> > > >> >     so that new flows have priority on old ones.
> > > >> >   - For a given flow, packets are dropped on arrival according
> > > >> >     to a drop probability.
> > > >> >   - Tail drops only.
> > > >>
> > > >> Why tail drops only?
> > > >
> > > > I had mentioned this because packets are dropped only at enqueue.
> > >
> > > Yup, realise that; was just wondering why you went with this design
> > > instead of doing the head drop that fq_codel does?
> > >
> > > >>
> > > >> > Usage:
> > > >> > tc qdisc ... fq_pie [ limit PACKETS ] [ flows NUMBER ]
> > > >> >                     [ alpha NUMBER ] [ beta NUMBER ]
> > > >> >                     [ target TIME us ] [ tupdate TIME us ]
> > > >> >                   [ bytemode ] [ quantum BYTES ]
> > > >> >                   [ ecn | ecn_prob PERCENTAGE ]
> > > >> >
> > > >> > defaults: 1024 flows, 10240 packets limit, quantum : device MTU
> > > >> >            target: 15ms
> > > >> >            tupdate: 15ms
> > > >> >            alpha: 2 (on a scale of 0 to 16)
> > > >> >            beta: 20 (on a scale of 0 to 16)
> > > >> >            ecn: false
> > > >> >            ecn_prob: 10%
> > > >> >
> > > >> > Signed-off-by: Mohit P. Tahiliani <tahiliani@nitk.edu.in>
> > > >> > Signed-off-by: Sachin D. Patil <sdp.sachin@gmail.com>
> > > >> > Signed-off-by: Mohit Bhasi <mohitbhasi1998@gmail.com>
> > > >> > Signed-off-by: V. Saicharan <vsaicharan1998@gmail.com>
> > > >> > Signed-off-by: Leslie Monis <lesliemonis@gmail.com>
> > > >> > Signed-off-by: Gautam Ramakrishnan <gautamramk@gmail.com>
> > > >> > Cc: Dave Taht <dave.taht@gmail.com>
> > > >> > ---
> > > >> >  include/uapi/linux/pkt_sched.h |  28 ++
> > > >> >  net/sched/Kconfig              |  14 +-
> > > >> >  net/sched/Makefile             |   1 +
> > > >> >  net/sched/sch_fq_pie.c         | 485 +++++++++++++++++++++++++++++++++
> > > >> >  4 files changed, 527 insertions(+), 1 deletion(-)
> > > >> >  create mode 100644 net/sched/sch_fq_pie.c
> > > >> >
> > > >> > diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
> > > >> > index 7ee74c3474bf..005413bd09ee 100644
> > > >> > --- a/include/uapi/linux/pkt_sched.h
> > > >> > +++ b/include/uapi/linux/pkt_sched.h
> > > >> > @@ -964,6 +964,34 @@ struct tc_pie_xstats {
> > > >> >       __u32 ecn_mark;         /* packets marked with ecn*/
> > > >> >  };
> > > >> >
> > > >> > +/* FQ PIE */
> > > >> > +enum {
> > > >> > +     TCA_FQ_PIE_UNSPEC,
> > > >> > +     TCA_FQ_PIE_TARGET,
> > > >> > +     TCA_FQ_PIE_LIMIT,
> > > >> > +     TCA_FQ_PIE_TUPDATE,
> > > >> > +     TCA_FQ_PIE_ALPHA,
> > > >> > +     TCA_FQ_PIE_BETA,
> > > >> > +     TCA_FQ_PIE_ECN,
> > > >> > +     TCA_FQ_PIE_QUANTUM,
> > > >> > +     TCA_FQ_PIE_BYTEMODE,
> > > >> > +     TCA_FQ_PIE_FLOWS,
> > > >> > +     TCA_FQ_PIE_ECN_PROB,
> > > >> > +     __TCA_FQ_PIE_MAX
> > > >> > +};
> > > >> > +#define TCA_FQ_PIE_MAX   (__TCA_FQ_PIE_MAX - 1)
> > > >> > +
> > > >> > +struct tc_fq_pie_xstats {
> > > >> > +     __u32 packets_in;       /* total number of packets enqueued */
> > > >> > +     __u32 dropped;          /* packets dropped due to fq_pie_action */
> > > >> > +     __u32 overlimit;        /* dropped due to lack of space in queue */
> > > >> > +     __u32 ecn_mark;         /* packets marked with ecn*/
> > > >> > +     __u32 new_flow_count;   /* number of time packets
> > > >> > +                                created a 'new flow' */
> > > >> > +     __u32 new_flows_len;    /* count of flows in new list */
> > > >> > +     __u32 old_flows_len;    /* count of flows in old list */
> > > >> > +};
> > > >> > +
> > > >> >  /* CBS */
> > > >> >  struct tc_cbs_qopt {
> > > >> >       __u8 offload;
> > > >> > diff --git a/net/sched/Kconfig b/net/sched/Kconfig
> > > >> > index 5c02ad97ef23..49f4dd9894a0 100644
> > > >> > --- a/net/sched/Kconfig
> > > >> > +++ b/net/sched/Kconfig
> > > >> > @@ -358,13 +358,25 @@ config NET_SCH_PIE
> > > >> >       help
> > > >> >         Say Y here if you want to use the Proportional Integral controller
> > > >> >         Enhanced scheduler packet scheduling algorithm.
> > > >> > -       For more information, please see https://tools.ietf.org/html/rfc8033
> > > >> > +       For more information, please see
> > > >> > +       http://tools.ietf.org/html/draft-pan-tsvwg-pie-00
> > > >> >
> > > >> >         To compile this driver as a module, choose M here: the module
> > > >> >         will be called sch_pie.
> > > >> >
> > > >> >         If unsure, say N.
> > > >> >
> > > >> > +config NET_SCH_FQ_PIE
> > > >> > +     tristate "Flow Queue Proportional Integral controller Enhanced (FQ-PIE) scheduler"
> > > >> > +     help
> > > >> > +       Say Y here if you want to use the Flow Queue Proportional Integral controller
> > > >> > +       Enhanced scheduler packet scheduling algorithm.
> > > >> > +
> > > >> > +       To compile this driver as a module, choose M here: the module
> > > >> > +       will be called sch_fq_pie.
> > > >> > +
> > > >> > +       If unsure, say N.
> > > >> > +
> > > >> >  config NET_SCH_INGRESS
> > > >> >       tristate "Ingress/classifier-action Qdisc"
> > > >> >       depends on NET_CLS_ACT
> > > >> > diff --git a/net/sched/Makefile b/net/sched/Makefile
> > > >> > index 8a40431d7b5c..fdcd3f7b2fb2 100644
> > > >> > --- a/net/sched/Makefile
> > > >> > +++ b/net/sched/Makefile
> > > >> > @@ -55,6 +55,7 @@ obj-$(CONFIG_NET_SCH_CAKE)  += sch_cake.o
> > > >> >  obj-$(CONFIG_NET_SCH_FQ)     += sch_fq.o
> > > >> >  obj-$(CONFIG_NET_SCH_HHF)    += sch_hhf.o
> > > >> >  obj-$(CONFIG_NET_SCH_PIE)    += sch_pie.o
> > > >> > +obj-$(CONFIG_NET_SCH_FQ_PIE)    += sch_fq_pie.o
> > > >> >  obj-$(CONFIG_NET_SCH_CBS)    += sch_cbs.o
> > > >> >  obj-$(CONFIG_NET_SCH_ETF)    += sch_etf.o
> > > >> >  obj-$(CONFIG_NET_SCH_TAPRIO) += sch_taprio.o
> > > >> > diff --git a/net/sched/sch_fq_pie.c b/net/sched/sch_fq_pie.c
> > > >> > new file mode 100644
> > > >> > index 000000000000..4ccefa0bc7f0
> > > >> > --- /dev/null
> > > >> > +++ b/net/sched/sch_fq_pie.c
> > > >> > @@ -0,0 +1,485 @@
> > > >> > +/*
> > > >> > + * net/sched/sch_fq_pie.c
> > > >> > + *
> > > >> > + * This program is free software; you can redistribute it and/or
> > > >> > + * modify it under the terms of the GNU General Public License
> > > >> > + * as published by the Free Software Foundation; either version 2
> > > >> > + * of the License.
> > > >> > + *
> > > >> > + * This program is distributed in the hope that it will be useful,
> > > >> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > >> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> > > >> > + * GNU General Public License for more details.
> > > >>
> > > >> Lose the license boilerplate and replace it with a SPDX header line.
> > > >
> > > > I shall do that.
> > > >
> > > >>
> > > >> > + * Author: Mohit P. Tahiliani <tahiliani@nitk.edu.in>
> > > >> > + * Author: Sachin D. Patil <sdp.sachin@gmail.com>
> > > >> > + * Author: Mohit Bhasi <mohitbhasi1998@gmail.com>
> > > >> > + * Author: V Saicharan <vsaicharan1998@gmail.com>
> > > >> > + * Author: Leslie Monis <lesliemonis@gmail.com>
> > > >> > + * Author: Gautam Ramakrishnan <gautamramk@gmail.com>
> > > >> > + *
> > > >> > + * References:
> > > >> > + * RFC 8033: https://tools.ietf.org/html/rfc8033
> > > >> > + */
> > > >> > +
> > > >> > +#include <linux/jhash.h>
> > > >> > +#include <linux/vmalloc.h>
> > > >> > +#include <net/pie.h>
> > > >> > +
> > > >> > +struct fq_pie_params {
> > > >> > +     struct pie_params p_params;
> > > >> > +     u32 ecn_prob;
> > > >> > +     u32 flows_cnt;
> > > >> > +};
> > > >> > +
> > > >> > +struct fq_pie_stats {
> > > >> > +     u32 packets_in;         /* total number of packets enqueued */
> > > >> > +     u32 dropped;            /* packets dropped due to fq_pie action */
> > > >> > +     u32 overlimit;          /* dropped due to lack of space in queue */
> > > >> > +     u32 ecn_mark;           /* packets marked with ECN */
> > > >> > +     u32 new_flow_count;     /* number of time packets created a new flow */
> > > >> > +};
> > > >> > +
> > > >> > +struct fq_pie_flow {
> > > >> > +     s32 deficit;            /* number of credits remaining for the flow */
> > > >> > +     u32 backlog;            /* size of data in the flow */
> > > >> > +     u32 qlen;               /* number of packets in the flow */
> > > >> > +     struct sk_buff *head;
> > > >> > +     struct sk_buff *tail;
> > > >> > +     struct list_head flowchain;
> > > >> > +     struct pie_vars vars;   /* pie vars for the flow */
> > > >> > +     struct pie_stats stats; /* pie stats for the flow */
> > > >> > +};
> > > >> > +
> > > >> > +struct fq_pie_sched_data {
> > > >> > +     u32 quantum;            /* number of credits in deficit round robin */
> > > >> > +     struct fq_pie_flow *flows;
> > > >> > +     struct Qdisc *sch;
> > > >> > +     struct fq_pie_params params;
> > > >> > +     struct fq_pie_stats stats;
> > > >> > +     struct list_head old_flows;
> > > >> > +     struct list_head new_flows;
> > > >> > +     struct timer_list adapt_timer;
> > > >> > +};
> > > >>
> > > >> The flow and sched_data structs have quite a bit in common with those in
> > > >> fq_codel; but the members are in a different order.
> > > >>
> > > >> > +static void fq_pie_params_init(struct fq_pie_params *params)
> > > >> > +{
> > > >> > +     pie_params_init(&params->p_params);
> > > >> > +     params->ecn_prob = 10;
> > > >> > +     params->flows_cnt = 1024;
> > > >> > +}
> > > >> > +
> > > >> > +static inline void flow_queue_add(struct fq_pie_flow *flow,
> > > >> > +                               struct sk_buff *skb)
> > > >> > +{
> > > >> > +     if (!flow->head)
> > > >> > +             flow->head = skb;
> > > >> > +     else
> > > >> > +             flow->tail->next = skb;
> > > >> > +     flow->tail = skb;
> > > >> > +     skb->next = NULL;
> > > >> > +}
> > > >> > +
> > > >> > +static int fq_pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
> > > >> > +                             struct sk_buff **to_free)
> > > >> > +{
> > > >> > +     struct fq_pie_sched_data *q = qdisc_priv(sch);
> > > >> > +     struct fq_pie_flow *sel_flow;
> > > >> > +     u32 pkt_len;
> > > >> > +     u32 idx;
> > > >> > +     u8 enqueue = false;
> > > >> > +
> > > >> > +     /* Classifies packet into corresponding flow */
> > > >> > +     idx = reciprocal_scale(skb_get_hash(skb), q->params.flows_cnt);
> > > >> > +     sel_flow = &q->flows[idx];
> > > >>
> > > >> This is missing the ability to override the classification from tc
> > > >> filters. See fq_codel_classify().
> > > >
> > > > I wanted to keep it simple initially. Shall add that.
> > > >
> > > >>
> > > >> > +
> > > >> > +     /* Checks if the qdisc is full */
> > > >> > +     if (unlikely(qdisc_qlen(sch) >= sch->limit)) {
> > > >> > +             q->stats.overlimit++;
> > > >> > +             sel_flow->stats.overlimit++;
> > > >> > +             goto out;
> > > >> > +     }
> > > >>
> > > >> The memory_limit checks in fq_codel have turned out to be quite useful
> > > >> on constrained systems. I'd suggest adding them here as well.
> > > >
> > > > I shall add that too.
> > > >
> > > >>
> > > >> > +
> > > >> > +     if (!drop_early(sch, sel_flow->backlog, skb->len, &sel_flow->vars,
> > > >> > +                     &q->params.p_params)) {
> > > >> > +             enqueue = true;
> > > >> > +     } else if (q->params.p_params.ecn &&
> > > >> > +                sel_flow->vars.prob <=
> > > >> > +                (MAX_PROB / 100) * q->params.ecn_prob &&
> > > >> > +                INET_ECN_set_ce(skb)) {
> > > >> > +             /* If packet is ecn capable, mark it if drop probability
> > > >> > +              * is lower than the parameter ecn_prob, else drop it.
> > > >> > +              */
> > > >> > +             q->stats.ecn_mark++;
> > > >> > +             sel_flow->stats.ecn_mark++;
> > > >> > +             enqueue = true;
> > > >> > +     }
> > > >> > +     if (enqueue) {
> > > >> > +             pkt_len = qdisc_pkt_len(skb);
> > > >> > +             q->stats.packets_in++;
> > > >> > +             sch->qstats.backlog += pkt_len;
> > > >> > +             sch->q.qlen++;
> > > >> > +             flow_queue_add(sel_flow, skb);
> > > >> > +             if (list_empty(&sel_flow->flowchain)) {
> > > >> > +                     list_add_tail(&sel_flow->flowchain, &q->new_flows);
> > > >> > +                     q->stats.new_flow_count++;
> > > >> > +                     sel_flow->deficit = q->quantum;
> > > >> > +                     sel_flow->stats.dropped = 0;
> > > >> > +                     sel_flow->qlen = 0;
> > > >> > +                     sel_flow->backlog = 0;
> > > >> > +             }
> > > >> > +             sel_flow->qlen++;
> > > >> > +             sel_flow->stats.packets_in++;
> > > >> > +             sel_flow->backlog += pkt_len;
> > > >> > +             return NET_XMIT_SUCCESS;
> > > >> > +     }
> > > >> > +out:
> > > >> > +     q->stats.dropped++;
> > > >> > +     sel_flow->stats.dropped++;
> > > >> > +     return qdisc_drop(skb, sch, to_free);
> > > >>
> > > >> You probably want to return NET_XMIT_CN instead of NET_XMIT_DROP here.
> > > >>
> > > >
> > > > Should I replace qdisc_drop with __qdisc_drop and return NET_XMIT_CN?
> > >
> > > Yeah, I think that would be more appropriate here, as that would
> > > directly throttle sockets on the same host.
> > >
> > > >> > +}
> > > >> > +
> > > >> > +static const struct nla_policy fq_pie_policy[TCA_FQ_PIE_MAX + 1] = {
> > > >> > +     [TCA_FQ_PIE_TARGET] = {.type = NLA_U32},
> > > >> > +     [TCA_FQ_PIE_LIMIT] = {.type = NLA_U32},
> > > >> > +     [TCA_FQ_PIE_TUPDATE] = {.type = NLA_U32},
> > > >> > +     [TCA_FQ_PIE_ALPHA] = {.type = NLA_U32},
> > > >> > +     [TCA_FQ_PIE_BETA] = {.type = NLA_U32},
> > > >> > +     [TCA_FQ_PIE_ECN] = {.type = NLA_U32},
> > > >> > +     [TCA_FQ_PIE_QUANTUM] = {.type = NLA_U32},
> > > >> > +     [TCA_FQ_PIE_BYTEMODE] = {.type = NLA_U32},
> > > >> > +     [TCA_FQ_PIE_FLOWS] = {.type = NLA_U32},
> > > >> > +     [TCA_FQ_PIE_ECN_PROB] = {.type = NLA_U32}
> > > >> > +};
> > > >> > +
> > > >> > +static inline struct sk_buff *dequeue_head(struct fq_pie_flow *flow)
> > > >> > +{
> > > >> > +     struct sk_buff *skb = flow->head;
> > > >> > +
> > > >> > +     flow->head = skb->next;
> > > >> > +     skb->next = NULL;
> > > >> > +     return skb;
> > > >> > +}
> > > >> > +
> > > >> > +static struct sk_buff *fq_pie_qdisc_dequeue(struct Qdisc *sch)
> > > >> > +{
> > > >> > +     struct fq_pie_sched_data *q = qdisc_priv(sch);
> > > >> > +     struct sk_buff *skb = NULL;
> > > >> > +     struct fq_pie_flow *flow;
> > > >> > +     struct list_head *head;
> > > >> > +     u32 pkt_len;
> > > >> > +
> > > >> > +begin:
> > > >> > +     head = &q->new_flows;
> > > >> > +     if (list_empty(head)) {
> > > >> > +             head = &q->old_flows;
> > > >> > +             if (list_empty(head))
> > > >> > +                     return NULL;
> > > >> > +     }
> > > >> > +
> > > >> > +     flow = list_first_entry(head, struct fq_pie_flow, flowchain);
> > > >> > +     /* Flow has exhausted all its credits */
> > > >> > +     if (flow->deficit <= 0) {
> > > >> > +             flow->deficit += q->quantum;
> > > >> > +             list_move_tail(&flow->flowchain, &q->old_flows);
> > > >> > +             goto begin;
> > > >> > +     }
> > > >> > +
> > > >> > +     if (flow->head) {
> > > >> > +             skb = dequeue_head(flow);
> > > >> > +             pkt_len = qdisc_pkt_len(skb);
> > > >> > +             sch->qstats.backlog -= pkt_len;
> > > >> > +             sch->q.qlen--;
> > > >> > +             qdisc_bstats_update(sch, skb);
> > > >> > +     }
> > > >>
> > > >> If you factor this out into a dequeue_func(), this dequeue() function is
> > > >> very close to identical to the one in fq_codel().
> > > >
> > > > Do you suggest I put the if(flow->head) block in a different function?
> > >
> > > Yeah, that was what I meant. However, without doing the full refactoring
> > > it's only borderline useful, so feel free to just keep it the way it
> > > is...
> > >
> > > -Toke
> >
> >
> >
> > --
> > -------------
> > Gautam |
>
>
>
> --
>
> Dave Täht
> CTO, TekLibre, LLC
> http://www.teklibre.com
> Tel: 1-831-205-9740
Toke Høiland-Jørgensen April 8, 2019, 8:54 a.m. UTC | #7
Gautam Ramakrishnan <gautamramk@gmail.com> writes:

> I was trying to refactor the code and I ran into some issues.
> 1. I moved some of the parameters such as flows_cnt into a new struct
> called fq_pie_params, instead of keeping them in fq_pie_sched_data.
> Should I move those parameters back into fq_pie_sched_data?

Hmm, this is getting into taste here, but I think I would just put them
directly into fq_pie_sched_data; after all, you are not reusing them in
multiple places, and things like 'q->params.p_params.target' is a bit
much for my taste :)

> 2. fq_codel maintains the backlog variable as a list in the
> fq_codel_sched_data, whereas I maintain a backlog in the struct
> fq_pie_flow. What approach should I follow?

The reason fq_codel does this is to keep that memory contiguous, so that
when it loops through it all (in fq_codel_drop()), it is less memory
that has to be loaded into cache. Since you're not doing that in fq_pie,
it's not strictly necessary to mirror this (admittedly, somewhat
confusing) aspect of fq_codel :)

> 3. Would maintaining a per flow pie_stats be useful in the future? I
> do not use the per flow stats anywhere in the code.

Hard to know what is useful in the future. Is it useful now? It's easier
to add new features later on than to remove them once they are there...
So if you don't have an immediate use case for it, my suggestion would
be to remove it, and add it back if someone really ends up needing it :)

-Toke
diff mbox series

Patch

diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index 7ee74c3474bf..005413bd09ee 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -964,6 +964,34 @@  struct tc_pie_xstats {
 	__u32 ecn_mark;         /* packets marked with ecn*/
 };
 
+/* FQ PIE */
+/* Netlink attributes accepted/emitted by the fq_pie qdisc; all are u32
+ * (see fq_pie_policy in net/sched/sch_fq_pie.c).
+ */
+enum {
+	TCA_FQ_PIE_UNSPEC,
+	TCA_FQ_PIE_TARGET,
+	TCA_FQ_PIE_LIMIT,
+	TCA_FQ_PIE_TUPDATE,
+	TCA_FQ_PIE_ALPHA,
+	TCA_FQ_PIE_BETA,
+	TCA_FQ_PIE_ECN,
+	TCA_FQ_PIE_QUANTUM,
+	TCA_FQ_PIE_BYTEMODE,
+	TCA_FQ_PIE_FLOWS,
+	TCA_FQ_PIE_ECN_PROB,
+	__TCA_FQ_PIE_MAX
+};
+#define TCA_FQ_PIE_MAX   (__TCA_FQ_PIE_MAX - 1)
+
+/* Statistics copied to userspace by fq_pie_dump_stats() */
+struct tc_fq_pie_xstats {
+	__u32 packets_in;       /* total number of packets enqueued */
+	__u32 dropped;          /* packets dropped due to fq_pie_action */
+	__u32 overlimit;        /* dropped due to lack of space in queue */
+	__u32 ecn_mark;         /* packets marked with ECN */
+	__u32 new_flow_count;   /* number of times packets
+				   created a 'new flow' */
+	__u32 new_flows_len;	/* count of flows in new list */
+	__u32 old_flows_len;	/* count of flows in old list */
+};
+
 /* CBS */
 struct tc_cbs_qopt {
 	__u8 offload;
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 5c02ad97ef23..49f4dd9894a0 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -358,13 +358,25 @@  config NET_SCH_PIE
 	help
 	  Say Y here if you want to use the Proportional Integral controller
 	  Enhanced scheduler packet scheduling algorithm.
-	  For more information, please see https://tools.ietf.org/html/rfc8033
+	  For more information, please see
+	  http://tools.ietf.org/html/draft-pan-tsvwg-pie-00
 
 	  To compile this driver as a module, choose M here: the module
 	  will be called sch_pie.
 
 	  If unsure, say N.
 
+config NET_SCH_FQ_PIE
+	tristate "Flow Queue Proportional Integral controller Enhanced (FQ-PIE) scheduler"
+	help
+	  Say Y here if you want to use the Flow Queue Proportional Integral controller
+	  Enhanced scheduler packet scheduling algorithm.
+
+	  To compile this driver as a module, choose M here: the module
+	  will be called sch_fq_pie.
+
+	  If unsure, say N.
+
 config NET_SCH_INGRESS
 	tristate "Ingress/classifier-action Qdisc"
 	depends on NET_CLS_ACT
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 8a40431d7b5c..fdcd3f7b2fb2 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -55,6 +55,7 @@  obj-$(CONFIG_NET_SCH_CAKE)	+= sch_cake.o
 obj-$(CONFIG_NET_SCH_FQ)	+= sch_fq.o
 obj-$(CONFIG_NET_SCH_HHF)	+= sch_hhf.o
 obj-$(CONFIG_NET_SCH_PIE)	+= sch_pie.o
+obj-$(CONFIG_NET_SCH_FQ_PIE)    += sch_fq_pie.o
 obj-$(CONFIG_NET_SCH_CBS)	+= sch_cbs.o
 obj-$(CONFIG_NET_SCH_ETF)	+= sch_etf.o
 obj-$(CONFIG_NET_SCH_TAPRIO)	+= sch_taprio.o
diff --git a/net/sched/sch_fq_pie.c b/net/sched/sch_fq_pie.c
new file mode 100644
index 000000000000..4ccefa0bc7f0
--- /dev/null
+++ b/net/sched/sch_fq_pie.c
@@ -0,0 +1,485 @@ 
+/*
+ * net/sched/sch_fq_pie.c
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * Author: Mohit P. Tahiliani <tahiliani@nitk.edu.in>
+ * Author: Sachin D. Patil <sdp.sachin@gmail.com>
+ * Author: Mohit Bhasi <mohitbhasi1998@gmail.com>
+ * Author: V Saicharan <vsaicharan1998@gmail.com>
+ * Author: Leslie Monis <lesliemonis@gmail.com>
+ * Author: Gautam Ramakrishnan <gautamramk@gmail.com>
+ *
+ * References:
+ * RFC 8033: https://tools.ietf.org/html/rfc8033
+ */
+
+#include <linux/jhash.h>
+#include <linux/vmalloc.h>
+#include <net/pie.h>
+
+struct fq_pie_params {
+	struct pie_params p_params;
+	u32 ecn_prob;
+	u32 flows_cnt;
+};
+
+struct fq_pie_stats {
+	u32 packets_in;		/* total number of packets enqueued */
+	u32 dropped;		/* packets dropped due to fq_pie action */
+	u32 overlimit;		/* dropped due to lack of space in queue */
+	u32 ecn_mark;		/* packets marked with ECN */
+	u32 new_flow_count;	/* number of time packets created a new flow */
+};
+
+struct fq_pie_flow {
+	s32 deficit;		/* number of credits remaining for the flow */
+	u32 backlog;		/* size of data in the flow */
+	u32 qlen;		/* number of packets in the flow */
+	struct sk_buff *head;
+	struct sk_buff *tail;
+	struct list_head flowchain;
+	struct pie_vars vars;	/* pie vars for the flow */
+	struct pie_stats stats;	/* pie stats for the flow */
+};
+
+struct fq_pie_sched_data {
+	u32 quantum;		/* number of credits in deficit round robin */
+	struct fq_pie_flow *flows;
+	struct Qdisc *sch;
+	struct fq_pie_params params;
+	struct fq_pie_stats stats;
+	struct list_head old_flows;
+	struct list_head new_flows;
+	struct timer_list adapt_timer;
+};
+
+/* Set the fq_pie defaults: per-flow PIE defaults plus a 10% ECN-marking
+ * probability threshold and 1024 stochastic flow queues.
+ */
+static void fq_pie_params_init(struct fq_pie_params *params)
+{
+	pie_params_init(&params->p_params);
+	params->ecn_prob = 10;		/* mark instead of drop while prob <= 10% */
+	params->flows_cnt = 1024;	/* default number of flow queues */
+}
+
+/* Append skb at the tail of the flow's private singly-linked packet list */
+static inline void flow_queue_add(struct fq_pie_flow *flow,
+				  struct sk_buff *skb)
+{
+	skb->next = NULL;
+	if (flow->head)
+		flow->tail->next = skb;
+	else
+		flow->head = skb;
+	flow->tail = skb;
+}
+
+/* Enqueue entry point.  The packet is hashed stochastically onto one of
+ * flows_cnt sub-queues; the flow's PIE state then decides on arrival
+ * whether to enqueue, ECN-mark or drop the packet (tail drop only).
+ * Returns NET_XMIT_SUCCESS or the result of qdisc_drop().
+ */
+static int fq_pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+				struct sk_buff **to_free)
+{
+	struct fq_pie_sched_data *q = qdisc_priv(sch);
+	struct fq_pie_flow *sel_flow;
+	u32 pkt_len;
+	u32 idx;
+	u8 enqueue = false;
+
+	/* Classifies packet into corresponding flow */
+	idx = reciprocal_scale(skb_get_hash(skb), q->params.flows_cnt);
+	sel_flow = &q->flows[idx];
+
+	/* Checks if the qdisc is full (packet-count limit) */
+	if (unlikely(qdisc_qlen(sch) >= sch->limit)) {
+		q->stats.overlimit++;
+		sel_flow->stats.overlimit++;
+		goto out;
+	}
+
+	/* NOTE(review): drop_early() is given skb->len while the accounting
+	 * below uses qdisc_pkt_len() — confirm the mismatch is intentional.
+	 */
+	if (!drop_early(sch, sel_flow->backlog, skb->len, &sel_flow->vars,
+			&q->params.p_params)) {
+		enqueue = true;
+	} else if (q->params.p_params.ecn &&
+		   sel_flow->vars.prob <=
+		   (MAX_PROB / 100) * q->params.ecn_prob &&
+		   INET_ECN_set_ce(skb)) {
+		/* If packet is ecn capable, mark it if drop probability
+		 * is lower than the parameter ecn_prob, else drop it.
+		 */
+		q->stats.ecn_mark++;
+		sel_flow->stats.ecn_mark++;
+		enqueue = true;
+	}
+	if (enqueue) {
+		pkt_len = qdisc_pkt_len(skb);
+		q->stats.packets_in++;
+		sch->qstats.backlog += pkt_len;
+		sch->q.qlen++;
+		flow_queue_add(sel_flow, skb);
+		/* An idle flow re-enters service on the new-flows list with
+		 * a full quantum of credits and zeroed per-flow counters.
+		 */
+		if (list_empty(&sel_flow->flowchain)) {
+			list_add_tail(&sel_flow->flowchain, &q->new_flows);
+			q->stats.new_flow_count++;
+			sel_flow->deficit = q->quantum;
+			sel_flow->stats.dropped = 0;
+			sel_flow->qlen = 0;
+			sel_flow->backlog = 0;
+		}
+		sel_flow->qlen++;
+		sel_flow->stats.packets_in++;
+		sel_flow->backlog += pkt_len;
+		return NET_XMIT_SUCCESS;
+	}
+out:
+	q->stats.dropped++;
+	sel_flow->stats.dropped++;
+	return qdisc_drop(skb, sch, to_free);
+}
+
+/* Netlink validation policy for the TCA_FQ_PIE_* attributes (all u32) */
+static const struct nla_policy fq_pie_policy[TCA_FQ_PIE_MAX + 1] = {
+	[TCA_FQ_PIE_TARGET] = {.type = NLA_U32},
+	[TCA_FQ_PIE_LIMIT] = {.type = NLA_U32},
+	[TCA_FQ_PIE_TUPDATE] = {.type = NLA_U32},
+	[TCA_FQ_PIE_ALPHA] = {.type = NLA_U32},
+	[TCA_FQ_PIE_BETA] = {.type = NLA_U32},
+	[TCA_FQ_PIE_ECN] = {.type = NLA_U32},
+	[TCA_FQ_PIE_QUANTUM] = {.type = NLA_U32},
+	[TCA_FQ_PIE_BYTEMODE] = {.type = NLA_U32},
+	[TCA_FQ_PIE_FLOWS] = {.type = NLA_U32},
+	[TCA_FQ_PIE_ECN_PROB] = {.type = NLA_U32}
+};
+
+/* Detach and return the first packet of the flow.  The caller must have
+ * checked that flow->head is non-NULL.
+ */
+static inline struct sk_buff *dequeue_head(struct fq_pie_flow *flow)
+{
+	struct sk_buff *first = flow->head;
+
+	flow->head = first->next;
+	first->next = NULL;
+	return first;
+}
+
+/* Dequeue entry point: deficit round robin, serving the new-flows list
+ * before the old-flows list so that new flows get priority.  Returns
+ * NULL when the qdisc is empty.
+ */
+static struct sk_buff *fq_pie_qdisc_dequeue(struct Qdisc *sch)
+{
+	struct fq_pie_sched_data *q = qdisc_priv(sch);
+	struct sk_buff *skb = NULL;
+	struct fq_pie_flow *flow;
+	struct list_head *head;
+	u32 pkt_len;
+
+begin:
+	/* Pick the list to service: new flows first, then old flows */
+	head = &q->new_flows;
+	if (list_empty(head)) {
+		head = &q->old_flows;
+		if (list_empty(head))
+			return NULL;
+	}
+
+	flow = list_first_entry(head, struct fq_pie_flow, flowchain);
+	/* Flow has exhausted all its credits: refill and demote it */
+	if (flow->deficit <= 0) {
+		flow->deficit += q->quantum;
+		list_move_tail(&flow->flowchain, &q->old_flows);
+		goto begin;
+	}
+
+	if (flow->head) {
+		skb = dequeue_head(flow);
+		pkt_len = qdisc_pkt_len(skb);
+		sch->qstats.backlog -= pkt_len;
+		sch->q.qlen--;
+		qdisc_bstats_update(sch, skb);
+	}
+
+	/* Flow is empty: a drained new flow is demoted to the old list
+	 * once (while other old flows remain); otherwise it is unlinked.
+	 */
+	if (!skb) {
+		if (head == &q->new_flows && !list_empty(&q->old_flows))
+			list_move_tail(&flow->flowchain, &q->old_flows);
+		else
+			list_del_init(&flow->flowchain);
+		goto begin;
+	}
+
+	flow->qlen--;
+	flow->deficit -= pkt_len;
+	flow->backlog -= pkt_len;
+	/* Feed the departure into PIE's dequeue-rate estimator */
+	pie_process_dequeue(flow->backlog, skb, &flow->vars);
+	return skb;
+}
+
+/* Parse and apply TCA_FQ_PIE_* netlink options.  flows_cnt may only be
+ * set before the flow table is allocated (i.e. at init time); all other
+ * parameters may be changed at runtime under the qdisc tree lock.  If
+ * the new limit is lower than the current queue length, excess packets
+ * are dequeued and dropped.
+ */
+static int fq_pie_change(struct Qdisc *sch, struct nlattr *opt,
+			 struct netlink_ext_ack *extack)
+{
+	struct fq_pie_sched_data *q = qdisc_priv(sch);
+	struct nlattr *tb[TCA_FQ_PIE_MAX + 1];
+	unsigned int len_dropped = 0;
+	unsigned int num_dropped = 0;
+	int err;
+
+	if (!opt)
+		return -EINVAL;
+
+	err = nla_parse_nested(tb, TCA_FQ_PIE_MAX, opt, fq_pie_policy, NULL);
+	if (err < 0)
+		return err;
+
+	if (tb[TCA_FQ_PIE_FLOWS]) {
+		/* The flow table cannot be resized once allocated */
+		if (q->flows)
+			return -EINVAL;
+		q->params.flows_cnt = nla_get_u32(tb[TCA_FQ_PIE_FLOWS]);
+		if (!q->params.flows_cnt ||
+		    q->params.flows_cnt > 65536)
+			return -EINVAL;
+	}
+
+	sch_tree_lock(sch);
+
+	/* target is supplied in microseconds; convert to pschedtime */
+	if (tb[TCA_FQ_PIE_TARGET]) {
+		u32 target = nla_get_u32(tb[TCA_FQ_PIE_TARGET]);
+
+		q->params.p_params.target =
+		PSCHED_NS2TICKS((u64)target * NSEC_PER_USEC);
+	}
+
+	/* tupdate is supplied in microseconds; stored in jiffies */
+	if (tb[TCA_FQ_PIE_TUPDATE])
+		q->params.p_params.tupdate =
+		usecs_to_jiffies(nla_get_u32(tb[TCA_FQ_PIE_TUPDATE]));
+
+	if (tb[TCA_FQ_PIE_LIMIT]) {
+		u32 limit = nla_get_u32(tb[TCA_FQ_PIE_LIMIT]);
+
+		q->params.p_params.limit = limit;
+		sch->limit = limit;
+	}
+
+	if (tb[TCA_FQ_PIE_ECN_PROB])
+		q->params.ecn_prob = nla_get_u32(tb[TCA_FQ_PIE_ECN_PROB]);
+
+	if (tb[TCA_FQ_PIE_ALPHA])
+		q->params.p_params.alpha = nla_get_u32(tb[TCA_FQ_PIE_ALPHA]);
+
+	if (tb[TCA_FQ_PIE_BETA])
+		q->params.p_params.beta = nla_get_u32(tb[TCA_FQ_PIE_BETA]);
+
+	if (tb[TCA_FQ_PIE_ECN])
+		q->params.p_params.ecn = nla_get_u32(tb[TCA_FQ_PIE_ECN]);
+
+	if (tb[TCA_FQ_PIE_QUANTUM])
+		q->quantum = nla_get_u32(tb[TCA_FQ_PIE_QUANTUM]);
+
+	if (tb[TCA_FQ_PIE_BYTEMODE])
+		q->params.p_params.bytemode =
+		nla_get_u32(tb[TCA_FQ_PIE_BYTEMODE]);
+
+	/* Drop excess packets if new limit is lower */
+	while (sch->q.qlen > sch->limit) {
+		struct sk_buff *skb = fq_pie_qdisc_dequeue(sch);
+
+		/* Account the packet BEFORE freeing it: qdisc_pkt_len()
+		 * reads skb->cb, so calling it after kfree_skb() is a
+		 * use-after-free.
+		 */
+		len_dropped += qdisc_pkt_len(skb);
+		num_dropped += 1;
+		kfree_skb(skb);
+	}
+	qdisc_tree_reduce_backlog(sch, num_dropped, len_dropped);
+
+	sch_tree_unlock(sch);
+	return 0;
+}
+
+/* Periodic timer: recompute the PIE drop probability of every flow and
+ * re-arm for the next tupdate interval.  Runs under the qdisc root lock.
+ */
+static void fq_pie_timer(struct timer_list *t)
+{
+	struct fq_pie_sched_data *q = from_timer(q, t, adapt_timer);
+	struct Qdisc *sch = q->sch;
+	spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch));
+	u32 idx;
+
+	spin_lock(root_lock);
+
+	/* idx must be wider than u16: flows_cnt may be 65536 (the maximum
+	 * accepted by fq_pie_change), and a u16 counter would wrap before
+	 * reaching it, looping forever under the root lock.
+	 */
+	for (idx = 0; idx < q->params.flows_cnt; idx++)
+		calculate_probability(q->flows[idx].backlog,
+				      &q->flows[idx].vars,
+				      &q->params.p_params);
+
+	/* reset the timer to fire after 'tupdate'; tupdate is in jiffies */
+	if (q->params.p_params.tupdate)
+		mod_timer(&q->adapt_timer,
+			  jiffies + q->params.p_params.tupdate);
+	spin_unlock(root_lock);
+}
+
+/* Initialise the qdisc: set defaults, apply any netlink options and
+ * allocate the flow table.  The probability-update timer is armed only
+ * after the flow table exists, so fq_pie_timer() can never dereference
+ * a NULL q->flows, and a failed init leaves no timer pending.
+ */
+static int fq_pie_init(struct Qdisc *sch, struct nlattr *opt,
+		       struct netlink_ext_ack *extack)
+{
+	struct fq_pie_sched_data *q = qdisc_priv(sch);
+	int err;
+	int i;
+
+	fq_pie_params_init(&q->params);
+	sch->limit = 10 * 1024;		/* default: 10240 packets */
+	q->params.p_params.limit = sch->limit;
+	q->quantum = psched_mtu(qdisc_dev(sch));
+	q->sch = sch;
+
+	INIT_LIST_HEAD(&q->new_flows);
+	INIT_LIST_HEAD(&q->old_flows);
+
+	timer_setup(&q->adapt_timer, fq_pie_timer, 0);
+
+	if (opt) {
+		/* May set flows_cnt; must run before the table is sized */
+		err = fq_pie_change(sch, opt, extack);
+		if (err)
+			return err;
+	}
+
+	if (!q->flows) {
+		q->flows = kvcalloc(q->params.flows_cnt,
+				    sizeof(struct fq_pie_flow),
+				    GFP_KERNEL);
+		if (!q->flows) {
+			err = -ENOMEM;
+			goto init_failure;
+		}
+		for (i = 0; i < q->params.flows_cnt; i++) {
+			struct fq_pie_flow *flow = q->flows + i;
+
+			INIT_LIST_HEAD(&flow->flowchain);
+			pie_vars_init(&flow->vars);
+		}
+	}
+
+	/* Arm the timer only now that the flow table is ready */
+	mod_timer(&q->adapt_timer, jiffies + HZ / 2);
+	return 0;
+
+init_failure:
+	q->params.flows_cnt = 0;
+
+	return err;
+}
+
+/* Dump the current configuration as nested TCA_OPTIONS netlink
+ * attributes.  Returns the nest length on success, -1 on failure.
+ */
+static int fq_pie_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+	struct fq_pie_sched_data *q = qdisc_priv(sch);
+	struct nlattr *opts;
+
+	opts = nla_nest_start(skb, TCA_OPTIONS);
+	if (!opts)
+		return -1;
+
+	/* convert target from pschedtime to us */
+	if (nla_put_u32(skb, TCA_FQ_PIE_TARGET,
+			((u32)PSCHED_TICKS2NS(q->params.p_params.target)) /
+			NSEC_PER_USEC) ||
+	    nla_put_u32(skb, TCA_FQ_PIE_LIMIT, sch->limit) ||
+	    nla_put_u32(skb, TCA_FQ_PIE_TUPDATE,
+			jiffies_to_usecs(q->params.p_params.tupdate)) ||
+	    nla_put_u32(skb, TCA_FQ_PIE_ALPHA, q->params.p_params.alpha) ||
+	    nla_put_u32(skb, TCA_FQ_PIE_BETA, q->params.p_params.beta) ||
+	    nla_put_u32(skb, TCA_FQ_PIE_ECN, q->params.p_params.ecn) ||
+	    nla_put_u32(skb, TCA_FQ_PIE_BYTEMODE,
+			q->params.p_params.bytemode) ||
+	    nla_put_u32(skb, TCA_FQ_PIE_QUANTUM, q->quantum) ||
+	    nla_put_u32(skb, TCA_FQ_PIE_FLOWS, q->params.flows_cnt) ||
+	    nla_put_u32(skb, TCA_FQ_PIE_ECN_PROB, q->params.ecn_prob))
+		goto nla_put_failure;
+
+	return nla_nest_end(skb, opts);
+
+nla_put_failure:
+	/* Trim the partially-built nest so the skb is left clean. */
+	nla_nest_cancel(skb, opts);
+	return -1;
+}
+
+/* Copy a snapshot of the extended statistics (tc_fq_pie_xstats) for
+ * "tc -s qdisc"-style consumers.
+ */
+static int fq_pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
+{
+	struct fq_pie_sched_data *q = qdisc_priv(sch);
+	struct tc_fq_pie_xstats st = {
+		.packets_in	= q->stats.packets_in,
+		.overlimit	= q->stats.overlimit,
+		.dropped	= q->stats.dropped,
+		.ecn_mark	= q->stats.ecn_mark,
+		.new_flow_count = q->stats.new_flow_count,
+	};
+	struct list_head *cursor;
+	u32 n_new = 0;
+	u32 n_old = 0;
+
+	/* Walk both flow lists under the tree lock so concurrent
+	 * enqueue/dequeue cannot modify them mid-traversal.
+	 */
+	sch_tree_lock(sch);
+	list_for_each(cursor, &q->new_flows)
+		n_new++;
+	list_for_each(cursor, &q->old_flows)
+		n_old++;
+	sch_tree_unlock(sch);
+
+	st.new_flows_len = n_new;
+	st.old_flows_len = n_old;
+
+	return gnet_stats_copy_app(d, &st, sizeof(st));
+}
+
+/* Drop all queued packets and return every flow to its freshly
+ * initialized state.  Uses rtnl_kfree_skbs(), so RTNL protection is
+ * assumed here.
+ */
+static void fq_pie_reset(struct Qdisc *sch)
+{
+	struct fq_pie_sched_data *q = qdisc_priv(sch);
+	u16 idx;
+
+	INIT_LIST_HEAD(&q->new_flows);
+	INIT_LIST_HEAD(&q->old_flows);
+
+	for (idx = 0; idx < q->params.flows_cnt; idx++) {
+		struct fq_pie_flow *flow = &q->flows[idx];
+
+		/* Free the flow's entire skb chain in one call. */
+		rtnl_kfree_skbs(flow->head, flow->tail);
+		flow->head = NULL;
+
+		INIT_LIST_HEAD(&flow->flowchain);
+		pie_vars_init(&flow->vars);
+	}
+
+	sch->q.qlen = 0;
+	sch->qstats.backlog = 0;
+}
+
+/* Qdisc teardown: stop the adaptation timer, then free the flow array.
+ *
+ * Ordering matters: fq_pie_timer() dereferences q->flows, so the timer
+ * must be fully quiesced with del_timer_sync() BEFORE kvfree(), or a
+ * concurrently running callback would use freed memory.  Clearing
+ * tupdate first prevents the callback from re-arming itself (see the
+ * re-arm condition in fq_pie_timer()).
+ */
+static void fq_pie_destroy(struct Qdisc *sch)
+{
+	struct fq_pie_sched_data *q = qdisc_priv(sch);
+
+	q->params.p_params.tupdate = 0;
+	del_timer_sync(&q->adapt_timer);
+	kvfree(q->flows);
+}
+
+static struct Qdisc_ops fq_pie_qdisc_ops __read_mostly = {
+	.id		= "fq_pie",
+	.priv_size	= sizeof(struct fq_pie_sched_data),
+	/* datapath */
+	.enqueue	= fq_pie_qdisc_enqueue,
+	.dequeue	= fq_pie_qdisc_dequeue,
+	.peek		= qdisc_peek_dequeued,
+	/* control path */
+	.init		= fq_pie_init,
+	.change		= fq_pie_change,
+	.reset		= fq_pie_reset,
+	.destroy	= fq_pie_destroy,
+	.dump		= fq_pie_dump,
+	.dump_stats	= fq_pie_dump_stats,
+	.owner		= THIS_MODULE,
+};
+
+/* Module entry point: register the "fq_pie" qdisc ops with the
+ * packet scheduler core.
+ */
+static int __init fq_pie_module_init(void)
+{
+	return register_qdisc(&fq_pie_qdisc_ops);
+}
+
+/* Module exit point: unregister the "fq_pie" qdisc ops. */
+static void __exit fq_pie_module_exit(void)
+{
+	unregister_qdisc(&fq_pie_qdisc_ops);
+}
+
+/* Module hookup and metadata. */
+module_init(fq_pie_module_init);
+module_exit(fq_pie_module_exit);
+
+MODULE_DESCRIPTION("Flow Queue Proportional Integral controller Enhanced (FQ-PIE) scheduler");
+MODULE_AUTHOR("Mohit P. Tahiliani");
+MODULE_AUTHOR("Sachin D. Patil");
+MODULE_AUTHOR("Mohit Bhasi");
+MODULE_AUTHOR("V Saicharan");
+MODULE_AUTHOR("Leslie Monis");
+MODULE_AUTHOR("Gautam Ramakrishnan");
+MODULE_LICENSE("GPL");