diff mbox

[V2] net/sched: sch_plug - Queue traffic until an explicit release command

Message ID 1327823619-29274-1-git-send-email-rshriram@cs.ubc.ca
State Changes Requested, archived
Delegated to: David Miller
Headers show

Commit Message

rshriram@cs.ubc.ca Jan. 29, 2012, 7:53 a.m. UTC
This qdisc can be used to implement output buffering, an essential
functionality required for consistent recovery in checkpoint based
fault tolerance systems. The qdisc supports two operations - plug and
unplug. When the qdisc receives a plug command via netlink request,
packets arriving henceforth are buffered until a corresponding unplug
command is received.

Its intention is to support speculative execution by allowing generated
network traffic to be rolled back. It is used to provide network
protection for domUs in the Remus high availability project, available as
part of Xen. This module is generic enough to be used by any other
system that wishes to add speculative execution and output buffering to
its applications.

This module was originally available in the linux 2.6.32 PV-OPS tree,
used as dom0 for Xen.

For more information, please refer to http://nss.cs.ubc.ca/remus/
and http://wiki.xensource.com/xenwiki/Remus

Changes since previous version:
 * Removed the hardcoded queue size
 * Removed special code to deal with fragmented skbs.

Signed-off-by: Brendan Cully <brendan@cs.ubc.ca>
Signed-off-by: Shriram Rajagopalan <rshriram@cs.ubc.ca>
[shriram - ported the code from older 2.6.32 to current tree]
---
 include/linux/pkt_sched.h |   16 +++++
 net/sched/Kconfig         |   19 +++++
 net/sched/Makefile        |    1 +
 net/sched/sch_plug.c      |  161 +++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 197 insertions(+), 0 deletions(-)
 create mode 100644 net/sched/sch_plug.c

Comments

Hagen Paul Pfeifer Jan. 29, 2012, 5:25 p.m. UTC | #1
* Shriram Rajagopalan | 2012-01-28 23:53:39 [-0800]:

>diff --git a/net/sched/sch_plug.c b/net/sched/sch_plug.c
>new file mode 100644
>index 0000000..d194cd2
>--- /dev/null
>+++ b/net/sched/sch_plug.c
>@@ -0,0 +1,161 @@
>+/*
>+ * sch_plug.c Queue traffic until an explicit release command
>+ *
>+ *             This program is free software; you can redistribute it and/or
>+ *             modify it under the terms of the GNU General Public License
>+ *             as published by the Free Software Foundation; either version
>+ *             2 of the License, or (at your option) any later version.
>+ *
>+ * The operation of the buffer is as follows:
>+ * When a checkpoint begins, a plug is inserted into the
>+ *   network queue by a netlink request (it operates by storing
>+ *   a pointer to the next packet which arrives and blocking dequeue
>+ *   when that packet is at the head of the queue).
>+ * When a checkpoint completes (the backup acknowledges receipt),
>+ *   currently-queued packets are released.
>+ * So it supports two operations, plug and unplug.
>+ */
>+
>+#include <linux/module.h>
>+#include <linux/types.h>
>+#include <linux/kernel.h>
>+#include <linux/errno.h>
>+#include <linux/netdevice.h>
>+#include <linux/skbuff.h>
>+#include <net/pkt_sched.h>
>+
>+struct plug_sched_data {
>+	u32 limit;
>+	/*
>+	 * stop points to the first packet which should not be
>+	 * delivered.  If it is NULL, plug_enqueue will set it to the
>+	 * next packet it sees.
>+	 *
>+	 * release is the last packet in the fifo that can be
>+	 * released.
>+	 */
>+	struct sk_buff *stop, *release;
>+};
>+
>+static int plug_enqueue(struct sk_buff *skb, struct Qdisc *sch)
>+{
>+	struct plug_sched_data *q = qdisc_priv(sch);
>+
>+	if (likely(sch->qstats.backlog + skb->len <= q->limit)) {
>+		if (!q->stop)
>+			q->stop = skb;
>+		return qdisc_enqueue_tail(skb, sch);
>+	}
>+	printk(KERN_WARNING "queue reported full: %u,%u (limit=%u)\n",
>+	       sch->qstats.backlog, skb->len, q->limit);

This printk is not required: "tc -s qdisc show" already reports overflows,
and a KERN_WARNING for every dropped packet will flood the log.

>+	return qdisc_reshape_fail(skb, sch);
>+}
>+
>+/* dequeue doesn't actually dequeue until the release command is
>+ * received. */
>+static struct sk_buff *plug_dequeue(struct Qdisc *sch)
>+{
>+	struct plug_sched_data *q = qdisc_priv(sch);
>+	struct sk_buff *peek;
>+
>+	if (qdisc_is_throttled(sch))
>+		return NULL;
>+
>+	peek = (struct sk_buff *)((sch->q).next);
>+
>+	if (peek == q->release) {
>+		/*
>+		 * This is the tail of the last round. Release it and
>+		 * block the queue
>+		 */
>+		qdisc_throttled(sch);
>+		return NULL;
>+	}
>+
>+	return qdisc_dequeue_head(sch);
>+}
>+
>+static int plug_init(struct Qdisc *sch, struct nlattr *opt)
>+{
>+	struct plug_sched_data *q = qdisc_priv(sch);
>+
>+	if (opt == NULL) {
>+		u32 pkt_limit = qdisc_dev(sch)->tx_queue_len ? : 1;

1? Isn't that a rather small default value?

>+		q->limit = pkt_limit * psched_mtu(qdisc_dev(sch));
>+	} else {
>+		struct tc_plug_qopt *ctl = nla_data(opt);
>+
>+		if (nla_len(opt) < sizeof(*ctl))
>+			return -EINVAL;
>+
>+		q->limit = ctl->limit;
>+	}
>+
>+	printk(KERN_DEBUG "sch_plug queue loaded with limit %u\n", q->limit);
>+	qdisc_throttled(sch);
>+	return 0;
>+}
>+
>+/* Receives 3 types of messages:
>+ * TCQ_PLUG_BUFFER: Inset a plug into the queue and
>+ *  buffer any incoming packets
>+ * TCQ_PLUG_RELEASE: Dequeue packets from queue head
>+ *   to beginning of the next plug.
>+ * TCQ_PLUG_LIMIT: Increase/decrease queue size

Why not an additional, unlimited state? TCQ_PLUG_RELEASE dequeues packets until
the stop pointer, but why not a completely message-driven mode without the stop
(e.g. send until a TCQ_PLUG_BUFFER message is received)?

This would make the qdisc more generic and usable for other users.

Hagen
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jamal Hadi Salim Jan. 30, 2012, 2:45 p.m. UTC | #2
Please address the comment from Hagen on the printk (kill it).
Other than that i think this simple enough and non-intrusive
to other users that you can add an Acked-by from me.
It will be nice to get it to be generic as per Hagen's and my
earlier comments. 

cheers,
jamal

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Hagen Paul Pfeifer Jan. 30, 2012, 2:54 p.m. UTC | #3
On Mon, 30 Jan 2012 09:45:56 -0500, Jamal Hadi Salim wrote:

> Please address the comment from Hagen on the printk (kill it).
> Other than that i think this simple enough and non-intrusive
> to other users that you can add an Acked-by from me.
> It will be nice to get it to be generic as per Hagen's and my
> earlier comments.

I thought about sch_plug a second time and I like the idea of a user-space
based flow-controlled queue. But as I wrote in my first email: the current
implementation is a little bit XEN specific. Make it more general
(plug/unplug, plug/packet-based-unplug) and it is a great contribution.

Hagen

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
rshriram@cs.ubc.ca Jan. 30, 2012, 4:47 p.m. UTC | #4
On 2012-01-30, at 6:54 AM, Hagen Paul Pfeifer <hagen@jauu.net> wrote:

> 
> On Mon, 30 Jan 2012 09:45:56 -0500, Jamal Hadi Salim wrote:
> 
>> Please address the comment from Hagen on the printk (kill it).
>> Other than that i think this simple enough and non-intrusive
>> to other users that you can add an Acked-by from me.
>> It will be nice to get it to be generic as per Hagen's and my
>> earlier comments.
> 
> I thought about sch_plug a second time and I like the idea of a user-space
> based flow-controlled queue. But as I wrote in my first email: the current
> implementation is a little bit XEN specific. Make it more general
> (plug/unplug

The use case I cited in the kconfig was xen specific. 
The plug and unplug functionality is already there and is generic.

I can certainly rename the tcq #defs :)

> , plug/packet-based-unplug) and it is a great contribution.
> 

Could you elaborate a little on the packet-based-unplug ?

I got your earlier comment on "indefinite unplug" until an explicit
plug is received. Is that what you mean by packet-based-unplug ?


Shriram
> Hagen
> 
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Hagen Paul Pfeifer Jan. 31, 2012, 3:25 p.m. UTC | #5
On Mon, 30 Jan 2012 08:47:45 -0800, Shriram Rajagopalan wrote:

> Could you elaborate a little on the packet-based-unplug ?
> 
> I got your earlier comment on "indefinite unplug" until an explicit
> plug is received. Is that what you mean by packet-based-unplug ?

Sure, imagine a multihop MANET network. Sometimes we have high-priority
crosstraffic in the next hop (router). Due to OLSR traffic information we
know in advance that the next hop is not in the ability to forward our low
priority packets. With this knowledge we can stop (unplug) local generated
and forwarded traffic and if the next hop has enough free bandwidth we can
restart (plug) sending already enqueued packets.

So for us a really simple plug/unplug mechanism is superior. Maybe a
head-drop FIFO based policy for forwarded traffic but I can provide a patch
on top of your patch to implement a head-drop policy.

Hagen
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
rshriram@cs.ubc.ca Jan. 31, 2012, 4:20 p.m. UTC | #6
On 2012-01-31, at 7:25 AM, Hagen Paul Pfeifer <hagen@jauu.net> wrote:

> 
> On Mon, 30 Jan 2012 08:47:45 -0800, Shriram Rajagopalan wrote:
> 
>> Could you elaborate a little on the packet-based-unplug ?
>> 
>> I got your earlier comment on "indefinite unplug" until an explicit
>> plug is received. Is that what you mean by packet-based-unplug ?
> 
> Sure, imagine a multihop MANET network. Sometimes we have high-priority
> crosstraffic in the next hop (router). Due to OLSR traffic information we
> know in advance that the next hop is not in the ability to forward our low
> priority packets. With this knowledge we can stop (unplug) local generated
> and forwarded traffic and if the next hop has enough free bandwidth we can
> restart (plug) sending already enqueued packets.
> 

I am assuming you mean stop(plug) and restart (unplug). But I get the idea.
Indefinite buffering (subject to qlimit) and release.

Will work on it. Thanks for the cool example :).

Shriram
> So for us a really simple plug/unplug mechanism is superior. Maybe a
> head-drop FIFO based policy for forwarded traffic but I can provide a patch
> on top of your patch to implement a head-drop policy.
> 
> Hagen
> 
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h
index 0d5b793..14ad024 100644
--- a/include/linux/pkt_sched.h
+++ b/include/linux/pkt_sched.h
@@ -127,6 +127,22 @@  struct tc_multiq_qopt {
 	__u16	max_bands;		/* Maximum number of queues */
 };
 
+/* PLUG section */
+
+#define TCQ_PLUG_BUFFER   0
+#define TCQ_PLUG_RELEASE  1
+#define TCQ_PLUG_LIMIT    2
+struct tc_plug_qopt {
+	/* TCQ_PLUG_BUFFER: Insert a plug into the queue and
+	 *  buffer any incoming packets
+	 * TCQ_PLUG_RELEASE: Dequeue packets from queue head
+	 *   to beginning of the next plug.
+	 * TCQ_PLUG_LIMIT: Set the queue size limit (bytes, taken from limit)
+	 */
+	int             action;
+	__u32           limit;
+};
+
 /* TBF section */
 
 struct tc_tbf_qopt {
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 2590e91..d763112 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -260,6 +260,25 @@  config NET_SCH_INGRESS
 	  To compile this code as a module, choose M here: the
 	  module will be called sch_ingress.
 
+config NET_SCH_PLUG
+	tristate "Plug network traffic until release (PLUG)"
+	---help---
+	  Say Y here if you are using this kernel for Xen dom0 and
+	  want to protect Xen guests with Remus.
+
+	  This queueing discipline is controlled by netlink. When it receives a
+	  plug command it inserts a plug into the outbound queue, so that the
+	  following packets are buffered until a release command arrives over
+	  netlink, releasing packets up to the plug for delivery.
+
+	  This module provides "output buffering" functionality in the Remus HA
+	  project. It enables speculative execution of virtual machines by allowing
+	  the generated network output to be rolled back if needed. For more
+	  information, please refer to http://wiki.xensource.com/xenwiki/Remus
+
+	  To compile this code as a module, choose M here: the
+	  module will be called sch_plug.
+
 comment "Classification"
 
 config NET_CLS
diff --git a/net/sched/Makefile b/net/sched/Makefile
index dc5889c..8cdf4e2 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -33,6 +33,7 @@  obj-$(CONFIG_NET_SCH_MULTIQ)	+= sch_multiq.o
 obj-$(CONFIG_NET_SCH_ATM)	+= sch_atm.o
 obj-$(CONFIG_NET_SCH_NETEM)	+= sch_netem.o
 obj-$(CONFIG_NET_SCH_DRR)	+= sch_drr.o
+obj-$(CONFIG_NET_SCH_PLUG)	+= sch_plug.o
 obj-$(CONFIG_NET_SCH_MQPRIO)	+= sch_mqprio.o
 obj-$(CONFIG_NET_SCH_CHOKE)	+= sch_choke.o
 obj-$(CONFIG_NET_SCH_QFQ)	+= sch_qfq.o
diff --git a/net/sched/sch_plug.c b/net/sched/sch_plug.c
new file mode 100644
index 0000000..d194cd2
--- /dev/null
+++ b/net/sched/sch_plug.c
@@ -0,0 +1,161 @@ 
+/*
+ * sch_plug.c Queue traffic until an explicit release command
+ *
+ *             This program is free software; you can redistribute it and/or
+ *             modify it under the terms of the GNU General Public License
+ *             as published by the Free Software Foundation; either version
+ *             2 of the License, or (at your option) any later version.
+ *
+ * The operation of the buffer is as follows:
+ * When a checkpoint begins, a plug is inserted into the
+ *   network queue by a netlink request (it operates by storing
+ *   a pointer to the next packet which arrives and blocking dequeue
+ *   when that packet is at the head of the queue).
+ * When a checkpoint completes (the backup acknowledges receipt),
+ *   currently-queued packets are released.
+ * So it supports two operations, plug and unplug.
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <net/pkt_sched.h>
+
+struct plug_sched_data {
+	u32 limit;	/* byte limit checked against the backlog */
+	/*
+	 * stop: first packet which should not be delivered.  plug_change
+	 * clears it on TCQ_PLUG_BUFFER; while it is NULL, plug_enqueue
+	 * sets it to the next packet it accepts.
+	 *
+	 * release: copy of stop taken on TCQ_PLUG_RELEASE.  plug_dequeue
+	 * stops when this packet is at the head, so it is not sent itself.
+	 */
+	struct sk_buff *stop, *release;
+};
+
+/* Queue skb at the tail while the byte backlog stays within q->limit; the
+ * first packet accepted while q->stop is NULL becomes the stop marker.
+ * Over-limit packets go to qdisc_reshape_fail() without a log message. */
+static int plug_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+	struct plug_sched_data *q = qdisc_priv(sch);
+
+	if (likely(sch->qstats.backlog + skb->len <= q->limit)) {
+		if (!q->stop)
+			q->stop = skb;
+		return qdisc_enqueue_tail(skb, sch);
+	}
+	return qdisc_reshape_fail(skb, sch);
+}
+
+/* Hand out packets only while unthrottled and the head is not q->release.
+ * When q->release reaches the head the qdisc throttles itself, so that
+ * packet and those behind it stay queued until plug_change unthrottles.
+ * NOTE(review): the head is taken from sch->q.next without an emptiness
+ * check; what that yields on an empty queue is not visible here - confirm.
+ */
+static struct sk_buff *plug_dequeue(struct Qdisc *sch)
+{
+	struct plug_sched_data *q = qdisc_priv(sch);
+	struct sk_buff *head;
+
+	if (qdisc_is_throttled(sch))
+		return NULL;
+
+	head = (struct sk_buff *)sch->q.next;
+	if (head != q->release)
+		return qdisc_dequeue_head(sch);
+
+	/* q->release is at the head: this round is finished, so hold the
+	 * packet back and block the queue */
+	qdisc_throttled(sch);
+	return NULL;
+}
+
+/* Limit: netlink value, else tx_queue_len (1 if zero) MTUs; starts throttled. */
+static int plug_init(struct Qdisc *sch, struct nlattr *opt)
+{
+	struct plug_sched_data *q = qdisc_priv(sch);
+
+	if (opt) {
+		struct tc_plug_qopt *qopt = nla_data(opt);
+
+		if (nla_len(opt) < sizeof(*qopt))
+			return -EINVAL;
+		q->limit = qopt->limit;
+	} else {
+		u32 npkts = qdisc_dev(sch)->tx_queue_len ? : 1;
+
+		q->limit = npkts * psched_mtu(qdisc_dev(sch));
+	}
+	printk(KERN_DEBUG "sch_plug queue loaded with limit %u\n", q->limit);
+	qdisc_throttled(sch);
+	return 0;
+}
+
+/* Receives 3 types of messages:
+ * TCQ_PLUG_BUFFER: Insert a plug into the queue and
+ *  buffer any incoming packets
+ * TCQ_PLUG_RELEASE: Dequeue packets from queue head
+ *   to beginning of the next plug.
+ * TCQ_PLUG_LIMIT: Set the queue size limit, in bytes
+ * Returns 0, or -EINVAL for a missing/short option or unknown action.
+ */
+static int plug_change(struct Qdisc *sch, struct nlattr *opt)
+{
+	struct plug_sched_data *q = qdisc_priv(sch);
+	struct tc_plug_qopt *msg;
+
+	if (opt == NULL || nla_len(opt) < sizeof(*msg))
+		return -EINVAL;
+	msg = nla_data(opt);
+
+	switch (msg->action) {
+	case TCQ_PLUG_BUFFER:
+		/* drop the marker; plug_enqueue sets it on the next packet */
+		q->stop = NULL;
+		break;
+	case TCQ_PLUG_RELEASE:
+		/* Dequeue from queue head until the stop pointer */
+		q->release = q->stop;
+		qdisc_unthrottled(sch);
+		netif_schedule_queue(sch->dev_queue);
+		break;
+	case TCQ_PLUG_LIMIT:
+		/* Limit is supplied in bytes */
+		q->limit = msg->limit;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static struct Qdisc_ops plug_qdisc_ops __read_mostly = {
+	.id          =       "plug",
+	.priv_size   =       sizeof(struct plug_sched_data),
+	.enqueue     =       plug_enqueue,
+	.dequeue     =       plug_dequeue,	/* NULL while plugged */
+	.peek        =       qdisc_peek_head,	/* not plug-specific */
+	.init        =       plug_init,
+	.change      =       plug_change,	/* buffer/release/limit */
+	.owner       =       THIS_MODULE,
+};
+
+static int __init plug_module_init(void)
+{
+	return register_qdisc(&plug_qdisc_ops);	/* result passed through */
+}
+
+static void __exit plug_module_exit(void)
+{
+	unregister_qdisc(&plug_qdisc_ops);	/* undo plug_module_init() */
+}
+module_init(plug_module_init)
+module_exit(plug_module_exit)
+MODULE_LICENSE("GPL");