diff mbox

[v3,net-next,2/2] netem: add cell concept to simulate special MAC behavior

Message ID 1322691627-20551-2-git-send-email-hagen@jauu.net
State Changes Requested, archived
Delegated to: David Miller
Headers show

Commit Message

Hagen Paul Pfeifer Nov. 30, 2011, 10:20 p.m. UTC
This extension can be used to simulate special link layer
characteristics. Simulate because packet data is not modified, only the
calculation base is changed to delay a packet based on the original
packet size and artificial cell information.

packet_overhead can be used to simulate a link layer header compression
scheme (e.g. set packet_overhead to -20) or with a positive
packet_overhead value an additional MAC header can be simulated. It is
also possible to "replace" the 14 byte Ethernet header with something
else.

cell_size and cell_overhead can be used to simulate cell-based link layer
schemes, like some TDMA schemes. Another application area is MAC schemes
that use link layer fragmentation with a (small) header per fragment.
Cell size is the maximum number of data bytes within one cell. Cell
overhead is an additional variable to change the per-cell overhead (e.g.
5 byte header per fragment).

Example (5 kbit/s, 20 byte per packet overhead, cell-size 100 byte, per
cell overhead 5 byte):

	tc qdisc add dev eth0 root netem rate 5kbit 20 100 5

Signed-off-by: Hagen Paul Pfeifer <hagen@jauu.net>
---

The current version of packet_len_2_sched_time() addresses Eric's div/mod
instruction concerns. I benchmarked the version in the patch against the
following version:


	if (q->cell_size) {
		u32 mod_carry = len % q->cell_size;
		u32 cells     = len / q->cell_size;
		if (mod_carry)
			mod_carry = (len > q->cell_size || !cells) ?
				q->cell_size - mod_carry : len - mod_carry;

		if (q->cell_overhead) {
			if (mod_carry)
				++cells;
			len += cells * q->cell_overhead;
		}
		len += mod_carry;
	}
	return len;


The patch version is a little bit faster for "all" packet sizes. For common
cases (e.g. max. 1000 byte packets, cell size 100 byte) the patch version
exhibits significant improvements. IMHO the current version is also more
understandable. Replacing div and mod by do_div() was not that successful.


 include/linux/pkt_sched.h |    3 +++
 net/sched/sch_netem.c     |   32 +++++++++++++++++++++++++++++---
 2 files changed, 32 insertions(+), 3 deletions(-)

Comments

Eric Dumazet Dec. 1, 2011, 3:30 a.m. UTC | #1
Le mercredi 30 novembre 2011 à 23:20 +0100, Hagen Paul Pfeifer a écrit :
> This extension can be used to simulate special link layer
> characteristics. Simulate because packet data is not modified, only the
> calculation base is changed to delay a packet based on the original
> packet size and artificial cell information.
> 
> packet_overhead can be used to simulate a link layer header compression
> scheme (e.g. set packet_overhead to -20) or with a positive
> packet_overhead value an additional MAC header can be simulated. It is
> also possible to "replace" the 14 byte Ethernet header with something
> else.
> 
> cell_size and cell_overhead can be used to simulate link layer schemes,
> based on cells, like some TDMA schemes. Another application area are MAC
> schemes using a link layer fragmentation with a (small) header each.
> Cell size is the maximum amount of data bytes within one cell. Cell
> overhead is an additional variable to change the per-cell-overhead (e.g.
> 5 byte header per fragment).
> 
> Example (5 kbit/s, 20 byte per packet overhead, cell-size 100 byte, per
> cell overhead 5 byte):
> 
> 	tc qdisc add dev eth0 root netem rate 5kbit 20 100 5
> 
> Signed-off-by: Hagen Paul Pfeifer <hagen@jauu.net>
> ---
> 
> The actual version of packet_len_2_sched_time() address Eric's div/mod
> instruction concerns. I benchmarked the version in the patch with the
> following version:
> 
> 
> 	if (q->cell_size) {
> 		u32 mod_carry = len % q->cell_size;
> 		u32 cells     = len / q->cell_size;
> 		if (mod_carry)
> 			mod_carry = (len > q->cell_size || !cells) ?
> 				q->cell_size - mod_carry : len - mod_carry;
> 
> 		if (q->cell_overhead) {
> 			if (mod_carry)
> 				++cells;
> 			len += cells * q->cell_overhead;
> 		}
> 		len += mod_carry;
> 	}
> 	return len;
> 
> 
> The patch version is a little bit faster for "all" packet sizes. For common
> cases (e.g. max. 1000 byte packets, cellsize 100 byte, the patch version
> exhibit significant improvements). IMHO the actual version is also more
> understandable. Replace div and mod by do_div() was not that successful.
> 
> 
>  include/linux/pkt_sched.h |    3 +++
>  net/sched/sch_netem.c     |   32 +++++++++++++++++++++++++++++---
>  2 files changed, 32 insertions(+), 3 deletions(-)
> 
> diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h
> index 26c37ca..63845cf 100644
> --- a/include/linux/pkt_sched.h
> +++ b/include/linux/pkt_sched.h
> @@ -498,6 +498,9 @@ struct tc_netem_corrupt {
>  
>  struct tc_netem_rate {
>  	__u32	rate;	/* byte/s */
> +	__s32   packet_overhead;
> +	__u32   cell_size;
> +	__s32   cell_overhead;
>  };
>  
>  enum {
> diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
> index 9b7af9f..bcd2b3f 100644
> --- a/net/sched/sch_netem.c
> +++ b/net/sched/sch_netem.c
> @@ -80,6 +80,9 @@ struct netem_sched_data {
>  	u32 reorder;
>  	u32 corrupt;
>  	u32 rate;
> +	s32 packet_overhead;
> +	u32 cell_size;
> +	s32 cell_overhead;
>  
>  	struct crndstate {
>  		u32 last;
> @@ -299,9 +302,26 @@ static psched_tdiff_t tabledist(psched_tdiff_t mu, psched_tdiff_t sigma,
>  	return  x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu;
>  }
>  
> -static psched_time_t packet_len_2_sched_time(unsigned int len, u32 rate)
> +static psched_time_t packet_len_2_sched_time(unsigned int len,
> +					     struct netem_sched_data *q)
>  {
> -	return PSCHED_NS2TICKS((u64)len * NSEC_PER_SEC / rate);
> +	u32 cells = 0;
> +	u32 datalen;
> +
> +	len += q->packet_overhead;
> +
> +	if (q->cell_size) {
> +		for (datalen = len; datalen >  q->cell_size; datalen -= q->cell_size)
> +			cells++;

Oh well.. you can exit this loop with data len = q->cell_size


Hmm, take a look at reciprocal divide ...

(include/linux/reciprocal_div.h)


Instead of :

u32 cells     = len / q->cell_size;

You set once q->cell_size_reciprocal = reciprocal_value(q->cell_size);
(in Qdisc init)

Then you do :

cells = reciprocal_divide(len, q->cell_size_reciprocal);

That's a multiply instead of a divide. On many CPUs that's a lot faster.

Think about a super packet (TSO) of 65000 bytes and cell_size=64

> +
> +		if (q->cell_overhead)
> +			len += cells * q->cell_overhead;
> +
> +		if (datalen)
> +			len += (q->cell_size - datalen);
> +	}
> +
> +	return PSCHED_NS2TICKS((u64)len * NSEC_PER_SEC / q->rate);
>  }
>  
>  /*
> @@ -381,7 +401,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
>  		if (q->rate) {
>  			struct sk_buff_head *list = &q->qdisc->q;
>  
> -			delay += packet_len_2_sched_time(skb->len, q->rate);
> +			delay += packet_len_2_sched_time(skb->len, q);
>  
>  			if (!skb_queue_empty(list)) {
>  				/*
> @@ -565,6 +585,9 @@ static void get_rate(struct Qdisc *sch, const struct nlattr *attr)
>  	const struct tc_netem_rate *r = nla_data(attr);
>  
>  	q->rate = r->rate;
> +	q->packet_overhead = r->packet_overhead;
> +	q->cell_size       = r->cell_size;
> +	q->cell_overhead   = r->cell_overhead;
>  }
>  
>  static int get_loss_clg(struct Qdisc *sch, const struct nlattr *attr)
> @@ -906,6 +929,9 @@ static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
>  	NLA_PUT(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt);
>  
>  	rate.rate = q->rate;
> +	rate.packet_overhead = q->packet_overhead;
> +	rate.cell_size       = q->cell_size;
> +	rate.cell_overhead   = q->cell_overhead;
>  	NLA_PUT(skb, TCA_NETEM_RATE, sizeof(rate), &rate);
>  
>  	if (dump_loss_model(q, skb) != 0)


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Hagen Paul Pfeifer Dec. 1, 2011, 8:25 a.m. UTC | #2
On Thu, 01 Dec 2011 04:30:25 +0100, Eric Dumazet wrote:

> Thats a multiply instead of a divide. On many cpus thats a lot faster.
>
> Think about a super packet (TSO) of 65000 bytes and cell_size=64

I've never imagined that I am going to say the following: you are wrong,
Eric! (ok, maybe you are right ;-)

TSO and Netem is a no-go. With netem you are strongly advised to disable
offloading. I mean TSO will result in _one_ delay of several minutes,
followed by a burst of packets, instead of packets spaced by several
seconds (with the rate of 1000byte/s) - which is what you want.

To sum up: skb->len is _never_ larger than the MTU for (normal, correct)
network emulation setups with netem. This assumption is why I
preferred the iterative solution over the div/mod solution.

Did I miss something?

Hagen

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Dumazet Dec. 1, 2011, 9:01 a.m. UTC | #3
Le jeudi 01 décembre 2011 à 09:25 +0100, Hagen Paul Pfeifer a écrit :
> On Thu, 01 Dec 2011 04:30:25 +0100, Eric Dumazet wrote:
> 
> > Thats a multiply instead of a divide. On many cpus thats a lot faster.
> >
> > Think about a super packet (TSO) of 65000 bytes and cell_size=64
> 
> I've never imagined that I am going to say the following: you are wrong,
> Eric! (ok, maybe you are right ;-)
> 
> TSO and Netem is a no-go. With netem you are strongly advised to disable
> offloading. I mean TSO will result in _one_ delay of several minutes,
> followed by a burst of packets. Instead of packets spaced by several
> seconds (with the rate of 1000byte/s) - which is what you wan't.
> 
> To sum up: skb->len is _never_ larger as the MTU for (normal, correct)
> network emulation setups with netem. This was the assumption why I
> preferred the iterative solution over the div/mod solution.
> 
> Did I miss something?
> 

Yes :)

I want to be able to use netem on a 10Gigabit link, and simulate a 5ms
delay. I already will hit the shared qdisc bottleneck, dont force me to
use small packets !

We did cleanups in net/sched to properly handle large packets as well.
(SFQ for example is OK)

Really, reciprocal divide is the way to go, it's faster anyway on modern
CPUs than your loop.



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Hagen Paul Pfeifer Dec. 1, 2011, 9:36 a.m. UTC | #4
On Thu, 01 Dec 2011 10:01:48 +0100, Eric Dumazet wrote:

> Yes :)

damn!

> I want to be able to use netem on a 10Gigabit link, and simulate a 5ms
> delay. I already will hit the shared qdisc bottleneck, dont force me to
> use small packets !

No, I don't want that. But with 10Gb/s links you will have packet
scheduling problems anyway - if you focus on an _accurate_ delay. A static
delay differs from rate shaping in use case. In the latter we (and probably
you) want an exact/realistic spacing between packets.

Due to timer and scheduling granularity, somewhere in between 1bit/s and
10Gb/s netem rate (and tbf) will not scale anymore. You will see bursts and
inaccurate spacings, far away from what you want to emulate. For us, we want
a realistic and clean behavior: if the result of the emulation is not
identical to the emulated link/device we cannot use it (some background
information).

Anyway: I was not sure what solution you prefer - for us both are
identical. That's why I presented two solutions, so you can pick up the
favorite one. I will re-code the calculation using a reciprocal divide.
Thanks Eric!

Hagen
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
stephen hemminger Dec. 1, 2011, 4:24 p.m. UTC | #5
One idea to do small delays at higher speed is to insert dummy pad frames
into the device. It would mean generating garbage, but would allow for
highly accurate fine grain delay.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Laight Dec. 1, 2011, 4:38 p.m. UTC | #6
> One idea to do small delays at higher speed is to insert 
> dummy pad frames into the device.
> It would mean generating garbage, but would allow for
> highly accurate fine grain delay.

Not a good idea.
They would have to be sent to a known MAC address
otherwise all the ethernet switches would forward them
on all output ports.

	David


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
stephen hemminger Dec. 1, 2011, 4:57 p.m. UTC | #7
On Thu, 1 Dec 2011 16:38:51 -0000
"David Laight" <David.Laight@ACULAB.COM> wrote:

>  
> > One idea to do small delays at higher speed is to insert 
> > dummy pad frames into the device.
> > It would mean generating garbage, but would allow for
> > highly accurate fine grain delay.
> 
> Not a good idea.
> They would have to be sent to a known MAC address
> otherwise all the ethernet switches would forward them
> on all output ports.
> 
> 	David
> 
> 

Yes it would have to be a constant destination, not sure if there
is a discard value in Ethernet protocol spec.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Rick Jones Dec. 1, 2011, 6:25 p.m. UTC | #8
On 12/01/2011 08:57 AM, Stephen Hemminger wrote:
> On Thu, 1 Dec 2011 16:38:51 -0000
> "David Laight"<David.Laight@ACULAB.COM>  wrote:
>
>>
>>> One idea to do small delays at higher speed is to insert
>>> dummy pad frames into the device.
>>> It would mean generating garbage, but would allow for
>>> highly accurate fine grain delay.
>>
>> Not a good idea.
>> They would have to be sent to a known MAC address
>> otherwise all the ethernet switches would forward them
>> on all output ports.
>>
>> 	David
>>
>>
>
> Yes it would have to be a constant destination, not sure if there
> is a discard value in Ethernet protocol spec.

Aren't there special addresses that aren't supposed to be forwarded by 
(intelligent) switches?  IIRC LLDP uses such things.  Though the IEEE 
may take a dim view of using it for such a purpose, and knuth only knows 
what switch bugs would be uncovered that way...

http://standards.ieee.org/develop/regauth/grpmac/public.html
http://en.wikipedia.org/wiki/Link_Layer_Discovery_Protocol

rick jones
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h
index 26c37ca..63845cf 100644
--- a/include/linux/pkt_sched.h
+++ b/include/linux/pkt_sched.h
@@ -498,6 +498,9 @@  struct tc_netem_corrupt {
 
 struct tc_netem_rate {
 	__u32	rate;	/* byte/s */
+	__s32   packet_overhead;
+	__u32   cell_size;
+	__s32   cell_overhead;
 };
 
 enum {
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 9b7af9f..bcd2b3f 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -80,6 +80,9 @@  struct netem_sched_data {
 	u32 reorder;
 	u32 corrupt;
 	u32 rate;
+	s32 packet_overhead;
+	u32 cell_size;
+	s32 cell_overhead;
 
 	struct crndstate {
 		u32 last;
@@ -299,9 +302,26 @@  static psched_tdiff_t tabledist(psched_tdiff_t mu, psched_tdiff_t sigma,
 	return  x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu;
 }
 
-static psched_time_t packet_len_2_sched_time(unsigned int len, u32 rate)
+static psched_time_t packet_len_2_sched_time(unsigned int len,
+					     struct netem_sched_data *q)
 {
-	return PSCHED_NS2TICKS((u64)len * NSEC_PER_SEC / rate);
+	u32 cells = 0;
+	u32 datalen;
+
+	len += q->packet_overhead;
+
+	if (q->cell_size) {
+		for (datalen = len; datalen >  q->cell_size; datalen -= q->cell_size)
+			cells++;
+
+		if (q->cell_overhead)
+			len += cells * q->cell_overhead;
+
+		if (datalen)
+			len += (q->cell_size - datalen);
+	}
+
+	return PSCHED_NS2TICKS((u64)len * NSEC_PER_SEC / q->rate);
 }
 
 /*
@@ -381,7 +401,7 @@  static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 		if (q->rate) {
 			struct sk_buff_head *list = &q->qdisc->q;
 
-			delay += packet_len_2_sched_time(skb->len, q->rate);
+			delay += packet_len_2_sched_time(skb->len, q);
 
 			if (!skb_queue_empty(list)) {
 				/*
@@ -565,6 +585,9 @@  static void get_rate(struct Qdisc *sch, const struct nlattr *attr)
 	const struct tc_netem_rate *r = nla_data(attr);
 
 	q->rate = r->rate;
+	q->packet_overhead = r->packet_overhead;
+	q->cell_size       = r->cell_size;
+	q->cell_overhead   = r->cell_overhead;
 }
 
 static int get_loss_clg(struct Qdisc *sch, const struct nlattr *attr)
@@ -906,6 +929,9 @@  static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
 	NLA_PUT(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt);
 
 	rate.rate = q->rate;
+	rate.packet_overhead = q->packet_overhead;
+	rate.cell_size       = q->cell_size;
+	rate.cell_overhead   = q->cell_overhead;
 	NLA_PUT(skb, TCA_NETEM_RATE, sizeof(rate), &rate);
 
 	if (dump_loss_model(q, skb) != 0)