diff mbox

[net-next] net: sched: use no more than one page in struct fw_head

Message ID 1394985990.9668.26.camel@edumazet-glaptop2.roam.corp.google.com
State Changes Requested, archived
Delegated to: David Miller
Headers show

Commit Message

Eric Dumazet March 16, 2014, 4:06 p.m. UTC
From: Eric Dumazet <edumazet@google.com>

In commit b4e9b520ca5d ("[NET_SCHED]: Add mask support to fwmark
classifier") Patrick added an u32 field in fw_head, making it slightly
bigger than one page.

Change the layout of this structure and let compiler emit a reciprocal
divide for fw_hash(), as this makes the core more readable and
more efficient those days.

This brings back the memory usage to a single page, and permits John
to add a rcu_head later without any worry.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Thomas Graf <tgraf@suug.ch>
Cc: John Fastabend <john.fastabend@gmail.com>
---
 net/sched/cls_fw.c |   38 ++++++++++----------------------------
 1 file changed, 10 insertions(+), 28 deletions(-)



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Thomas Graf March 17, 2014, 1:51 p.m. UTC | #1
On 03/16/14 at 09:06am, Eric Dumazet wrote:
> From: Eric Dumazet <edumazet@google.com>
> 
> In commit b4e9b520ca5d ("[NET_SCHED]: Add mask support to fwmark
> classifier") Patrick added an u32 field in fw_head, making it slightly
> bigger than one page.
> 
> Change the layout of this structure and let compiler emit a reciprocal
> divide for fw_hash(), as this makes the core more readable and
> more efficient those days.

I think you  need to educate me a bit on this. objdump
spits out the following:

static u32 fw_hash(u32 handle)
{
        return handle % HTSIZE;
  1d:   bf ff 01 00 00          mov    edi,0x1ff
  22:   89 f0                   mov    eax,esi
  24:   31 d2                   xor    edx,edx
  26:   f7 f7                   div    edi

Doesn't look like a reciprocal div to me. Where did I
screw up or why doesn't gcc optimize it properly?
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Dumazet March 17, 2014, 2:13 p.m. UTC | #2
On Mon, 2014-03-17 at 13:51 +0000, Thomas Graf wrote:
> On 03/16/14 at 09:06am, Eric Dumazet wrote:
> > From: Eric Dumazet <edumazet@google.com>
> > 
> > In commit b4e9b520ca5d ("[NET_SCHED]: Add mask support to fwmark
> > classifier") Patrick added an u32 field in fw_head, making it slightly
> > bigger than one page.
> > 
> > Change the layout of this structure and let compiler emit a reciprocal
> > divide for fw_hash(), as this makes the core more readable and
> > more efficient those days.
> 
> I think you  need to educate me a bit on this. objdump
> spits out the following:
> 
> static u32 fw_hash(u32 handle)
> {
>         return handle % HTSIZE;
>   1d:   bf ff 01 00 00          mov    edi,0x1ff
>   22:   89 f0                   mov    eax,esi
>   24:   31 d2                   xor    edx,edx
>   26:   f7 f7                   div    edi
> 
> Doesn't look like a reciprocal div to me. Where did I
> screw up or why doesn't gcc optimize it properly?
> --

Thats because on your cpu, gcc knows the divide is cheaper than anything
else (a multiply followed by a shift)

What are your exact CFLAGS ?



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Dumazet March 17, 2014, 3:16 p.m. UTC | #3
On Mon, 2014-03-17 at 14:29 +0000, David Laight wrote:

> Especially for a modulus operation - which requires a second multiply
> and probably has issues with some large values.
> 
> For a hash you could use '(handle * 0x1ffull) >> 32' to reduce the hash.

Nice try, but this wont work.

Most of the time @handle is a small value, for example in the 0 .. 100
range.


> 
> or use a 'modulo 2^n-1 reduction':
> 	handle = (handle & 0x3ffff) + (handle >> 18);
> 	do
> 		handle = (handle & 0x1ff) + (handle >> 9);
> 	while (handle > 0x1ffu);

All these tricks were needed 20 years ago.

Even Thomas laptop can do a divide faster than this loop.

Let the compiler do the thing itself, it is its job.


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Thomas Graf March 17, 2014, 3:28 p.m. UTC | #4
On 03/17/14 at 07:13am, Eric Dumazet wrote:
> On Mon, 2014-03-17 at 13:51 +0000, Thomas Graf wrote:
> > On 03/16/14 at 09:06am, Eric Dumazet wrote:
> > > From: Eric Dumazet <edumazet@google.com>
> > > 
> > > In commit b4e9b520ca5d ("[NET_SCHED]: Add mask support to fwmark
> > > classifier") Patrick added an u32 field in fw_head, making it slightly
> > > bigger than one page.
> > > 
> > > Change the layout of this structure and let compiler emit a reciprocal
> > > divide for fw_hash(), as this makes the core more readable and
> > > more efficient those days.
> > 
> > I think you  need to educate me a bit on this. objdump
> > spits out the following:
> > 
> > static u32 fw_hash(u32 handle)
> > {
> >         return handle % HTSIZE;
> >   1d:   bf ff 01 00 00          mov    edi,0x1ff
> >   22:   89 f0                   mov    eax,esi
> >   24:   31 d2                   xor    edx,edx
> >   26:   f7 f7                   div    edi
> > 
> > Doesn't look like a reciprocal div to me. Where did I
> > screw up or why doesn't gcc optimize it properly?
> > --
> 
> Thats because on your cpu, gcc knows the divide is cheaper than anything
> else (a multiply followed by a shift)

OK.

> What are your exact CFLAGS ?

  gcc -Wp,-MD,net/sched/.cls_fw.o.d  -nostdinc -isystem
/usr/lib/gcc/x86_64-redhat-linux/4.8.2/include
-I/home/tgraf/dev/linux/net/arch/x86/include
-Iarch/x86/include/generated  -Iinclude
-I/home/tgraf/dev/linux/net/arch/x86/include/uapi
-Iarch/x86/include/generated/uapi
-I/home/tgraf/dev/linux/net/include/uapi -Iinclude/generated/uapi
-include /home/tgraf/dev/linux/net/include/linux/kconfig.h
-D__KERNEL__ -Wall -Wundef -Wstrict-prototypes -Wno-trigraphs
-fno-strict-aliasing -fno-common -Werror-implicit-function-declaration
-Wno-format-security -fno-delete-null-pointer-checks -Os
-Wno-maybe-uninitialized -m64 -mno-mmx -mno-sse
-mpreferred-stack-boundary=3 -mtune=generic -mno-red-zone
-mcmodel=kernel -funit-at-a-time -maccumulate-outgoing-args
-DCONFIG_X86_X32_ABI -DCONFIG_AS_CFI=1 -DCONFIG_AS_CFI_SIGNAL_FRAME=1
-DCONFIG_AS_CFI_SECTIONS=1 -DCONFIG_AS_FXSAVEQ=1 -DCONFIG_AS_AVX=1
-DCONFIG_AS_AVX2=1 -pipe -Wno-sign-compare
-fno-asynchronous-unwind-tables -mno-sse -mno-mmx -mno-sse2 -mno-3dnow
-mno-avx -fno-reorder-blocks -fno-ipa-cp-clone -fno-partial-inlining
-Wframe-larger-than=2048 -fno-stack-protector
-Wno-unused-but-set-variable -fno-omit-frame-pointer
-fno-optimize-sibling-calls -g -femit-struct-debug-baseonly
-fno-var-tracking -pg -mfentry -DCC_USING_FENTRY
-fno-inline-functions-called-once -Wdeclaration-after-statement
-Wno-pointer-sign -fno-strict-overflow -fconserve-stack
-Werror=implicit-int -Werror=strict-prototypes -DCC_HAVE_ASM_GOTO
-fprofile-arcs -ftest-coverage  -DMODULE  -D"KBUILD_STR(s)=#s"
-D"KBUILD_BASENAME=KBUILD_STR(cls_fw)"
-D"KBUILD_MODNAME=KBUILD_STR(cls_fw)" -c -o net/sched/.tmp_cls_fw.o
net/sched/cls_fw.c

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Thomas Graf March 17, 2014, 3:30 p.m. UTC | #5
On 03/17/14 at 08:16am, Eric Dumazet wrote:
> On Mon, 2014-03-17 at 14:29 +0000, David Laight wrote:
> 
> > Especially for a modulus operation - which requires a second multiply
> > and probably has issues with some large values.
> > 
> > For a hash you could use '(handle * 0x1ffull) >> 32' to reduce the hash.
> 
> Nice try, but this wont work.
> 
> Most of the time @handle is a small value, for example in the 0 .. 100
> range.
> 
> 
> > 
> > or use a 'modulo 2^n-1 reduction':
> > 	handle = (handle & 0x3ffff) + (handle >> 18);
> > 	do
> > 		handle = (handle & 0x1ff) + (handle >> 9);
> > 	while (handle > 0x1ffu);
> 
> All these tricks were needed 20 years ago.
> 
> Even Thomas laptop can do a divide faster than this loop.
> 
> Let the compiler do the thing itself, it is its job.

So why do we have reciprocal_divide()?
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Dumazet March 17, 2014, 3:33 p.m. UTC | #6
On Mon, 2014-03-17 at 15:30 +0000, Thomas Graf wrote:

> So why do we have reciprocal_divide()?

Thats when you do not have a constant divisor ;)

Gcc uses a reciprocal divide if the divisor is known/constant.

If not, it _has_ to use a real divide.


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Laight March 17, 2014, 3:43 p.m. UTC | #7
From: Eric Dumazet ...

> > or use a 'modulo 2^n-1 reduction':

> > 	handle = (handle & 0x3ffff) + (handle >> 18);

> > 	do

> > 		handle = (handle & 0x1ff) + (handle >> 9);

> > 	while (handle > 0x1ffu);

> 

> All these tricks were needed 20 years ago.

> 

> Even Thomas laptop can do a divide faster than this loop.


The nios cpu on my fpga can't :-)
OTOH I wouldn't run Linux on a 100Mhz cpu.

	David
Thomas Graf March 17, 2014, 3:50 p.m. UTC | #8
On 03/17/14 at 03:28pm, Thomas Graf wrote:
> On 03/17/14 at 07:13am, Eric Dumazet wrote:
> > On Mon, 2014-03-17 at 13:51 +0000, Thomas Graf wrote:
> > > On 03/16/14 at 09:06am, Eric Dumazet wrote:
> > > > From: Eric Dumazet <edumazet@google.com>
> > > > 
> > > > In commit b4e9b520ca5d ("[NET_SCHED]: Add mask support to fwmark
> > > > classifier") Patrick added an u32 field in fw_head, making it slightly
> > > > bigger than one page.
> > > > 
> > > > Change the layout of this structure and let compiler emit a reciprocal
> > > > divide for fw_hash(), as this makes the core more readable and
> > > > more efficient those days.
> > > 
> > > I think you  need to educate me a bit on this. objdump
> > > spits out the following:
> > > 
> > > static u32 fw_hash(u32 handle)
> > > {
> > >         return handle % HTSIZE;
> > >   1d:   bf ff 01 00 00          mov    edi,0x1ff
> > >   22:   89 f0                   mov    eax,esi
> > >   24:   31 d2                   xor    edx,edx
> > >   26:   f7 f7                   div    edi
> > > 
> > > Doesn't look like a reciprocal div to me. Where did I
> > > screw up or why doesn't gcc optimize it properly?
> > > --
> > 
> > Thats because on your cpu, gcc knows the divide is cheaper than anything
> > else (a multiply followed by a shift)
> 
> OK.

[0] lists 7-17 cycles for DIV r32 on Nehalem or 17-28 in terms of
latency. Benefit of the data fitting into a single page clearly
outweights this slight increase in instructions.

Acked-by: Thomas Graf <tgraf@suug.ch>

[0] http://www.agner.org/optimize/instruction_tables.pdf
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Dumazet March 17, 2014, 3:52 p.m. UTC | #9
On Mon, 2014-03-17 at 15:43 +0000, David Laight wrote:

> The nios cpu on my fpga can't :-)

In this case, gcc will use a reciprocal divide.

Depending on the divisor it can be one or two multiply, and some
trivial arithmetics.



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Laight March 17, 2014, 4 p.m. UTC | #10
From: Thomas Graf
> On 03/17/14 at 03:28pm, Thomas Graf wrote:
> > On 03/17/14 at 07:13am, Eric Dumazet wrote:
> > > On Mon, 2014-03-17 at 13:51 +0000, Thomas Graf wrote:
> > > > On 03/16/14 at 09:06am, Eric Dumazet wrote:
> > > > > From: Eric Dumazet <edumazet@google.com>
> > > > >
> > > > > In commit b4e9b520ca5d ("[NET_SCHED]: Add mask support to fwmark
> > > > > classifier") Patrick added an u32 field in fw_head, making it slightly
> > > > > bigger than one page.
> > > > >
> > > > > Change the layout of this structure and let compiler emit a reciprocal
> > > > > divide for fw_hash(), as this makes the core more readable and
> > > > > more efficient those days.
> > > >
> > > > I think you  need to educate me a bit on this. objdump
> > > > spits out the following:
> > > >
> > > > static u32 fw_hash(u32 handle)
> > > > {
> > > >         return handle % HTSIZE;
> > > >   1d:   bf ff 01 00 00          mov    edi,0x1ff
> > > >   22:   89 f0                   mov    eax,esi
> > > >   24:   31 d2                   xor    edx,edx
> > > >   26:   f7 f7                   div    edi
> > > >
> > > > Doesn't look like a reciprocal div to me. Where did I
> > > > screw up or why doesn't gcc optimize it properly?
> > > > --
> > >
> > > Thats because on your cpu, gcc knows the divide is cheaper than anything
> > > else (a multiply followed by a shift)
> >
> > OK.
> 
> [0] lists 7-17 cycles for DIV r32 on Nehalem or 17-28 in terms of
> latency. Benefit of the data fitting into a single page clearly
> outweights this slight increase in instructions.

Actually the -Os forces the divide.
With -O3 the generated code I get for amd64 is a multiply by reciprocal
to get the quotient followed by an open coded multiply by 0x1ff and
then a subtract. 13 instructions, only a few of which are register renames
or are non-dependant.

	David



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Dumazet March 17, 2014, 4:16 p.m. UTC | #11
On Mon, 2014-03-17 at 16:00 +0000, David Laight wrote:

> Actually the -Os forces the divide.

Yep, this might be the case.

> With -O3 the generated code I get for amd64 is a multiply by reciprocal
> to get the quotient followed by an open coded multiply by 0x1ff and
> then a subtract. 13 instructions, only a few of which are register renames
> or are non-dependant.

I think its more 4 arithmetic instructions : 2 multiplies, one shift,
one sub.

Note that when John adds an rcu_head, the 0x1ff becomes 0x1fd

   	89 f8                	mov    %edi,%eax
   	ba b3 21 c1 80       	mov    $0x80c121b3,%edx
   	f7 e2                	mul    %edx
   	c1 ea 08             	shr    $0x8,%edx
  	69 d2 fd 01 00 00    	imul   $0x1fd,%edx,%edx
  	29 d7                	sub    %edx,%edi
  	89 f8                	mov    %edi,%eax



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Miller March 18, 2014, 2:31 a.m. UTC | #12
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Sun, 16 Mar 2014 09:06:30 -0700

> From: Eric Dumazet <edumazet@google.com>
> 
> In commit b4e9b520ca5d ("[NET_SCHED]: Add mask support to fwmark
> classifier") Patrick added an u32 field in fw_head, making it slightly
> bigger than one page.
> 
> Change the layout of this structure and let compiler emit a reciprocal
> divide for fw_hash(), as this makes the core more readable and
> more efficient those days.
> 
> This brings back the memory usage to a single page, and permits John
> to add a rcu_head later without any worry.
> 
> Signed-off-by: Eric Dumazet <edumazet@google.com>

Eric let's just use an HSIZE of 256 and be done with it.

Then all of this talk of divides and cost goes out the window, as we
can assume a power of two and use simple masking.

This is basically what other classifiers in similar situations do.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Dumazet March 18, 2014, 3:02 a.m. UTC | #13
On Mon, 2014-03-17 at 22:31 -0400, David Miller wrote:

> Eric let's just use an HSIZE of 256 and be done with it.
> 
> Then all of this talk of divides and cost goes out the window, as we
> can assume a power of two and use simple masking.
> 
> This is basically what other classifiers in similar situations do.

Yep, will send a v2.


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c
index a366537f82c6..e054adafe836 100644
--- a/net/sched/cls_fw.c
+++ b/net/sched/cls_fw.c
@@ -29,13 +29,16 @@ 
 #include <net/act_api.h>
 #include <net/pkt_cls.h>
 
-#define HTSIZE (PAGE_SIZE/sizeof(struct fw_filter *))
-
 struct fw_head {
-	struct fw_filter *ht[HTSIZE];
-	u32 mask;
+	u32			mask;
+	struct fw_filter	*ht[0];
 };
 
+#define FW_HEAD_SIZE PAGE_SIZE
+
+#define HTSIZE ((FW_HEAD_SIZE - sizeof(struct fw_head)) /	\
+		sizeof(struct fw_filter *))
+
 struct fw_filter {
 	struct fw_filter	*next;
 	u32			id;
@@ -46,30 +49,9 @@  struct fw_filter {
 	struct tcf_exts		exts;
 };
 
-static inline int fw_hash(u32 handle)
+static u32 fw_hash(u32 handle)
 {
-	if (HTSIZE == 4096)
-		return ((handle >> 24) & 0xFFF) ^
-		       ((handle >> 12) & 0xFFF) ^
-		       (handle & 0xFFF);
-	else if (HTSIZE == 2048)
-		return ((handle >> 22) & 0x7FF) ^
-		       ((handle >> 11) & 0x7FF) ^
-		       (handle & 0x7FF);
-	else if (HTSIZE == 1024)
-		return ((handle >> 20) & 0x3FF) ^
-		       ((handle >> 10) & 0x3FF) ^
-		       (handle & 0x3FF);
-	else if (HTSIZE == 512)
-		return (handle >> 27) ^
-		       ((handle >> 18) & 0x1FF) ^
-		       ((handle >> 9) & 0x1FF) ^
-		       (handle & 0x1FF);
-	else if (HTSIZE == 256) {
-		u8 *t = (u8 *) &handle;
-		return t[0] ^ t[1] ^ t[2] ^ t[3];
-	} else
-		return handle & (HTSIZE - 1);
+	return handle % HTSIZE;
 }
 
 static int fw_classify(struct sk_buff *skb, const struct tcf_proto *tp,
@@ -266,7 +248,7 @@  static int fw_change(struct net *net, struct sk_buff *in_skb,
 		if (tb[TCA_FW_MASK])
 			mask = nla_get_u32(tb[TCA_FW_MASK]);
 
-		head = kzalloc(sizeof(struct fw_head), GFP_KERNEL);
+		head = kzalloc(FW_HEAD_SIZE, GFP_KERNEL);
 		if (head == NULL)
 			return -ENOBUFS;
 		head->mask = mask;