Message ID | 20140210110342.GA15806@visitor2.iram.es (mailing list archive) |
---|---|
State | Superseded |
Headers | show |
> > However, your other solutions are better. > > > > > > > > > > > > > > mask = (FM & 1); > > > > > mask |= (FM << 3) & 0x10; > > > > > mask |= (FM << 6) & 0x100; > > > > > mask |= (FM << 9) & 0x1000; > > > > > mask |= (FM << 12) & 0x10000; > > > > > mask |= (FM << 15) & 0x100000; > > > > > mask |= (FM << 18) & 0x1000000; > > > > > mask |= (FM << 21) & 0x10000000; > > > > > mask *= 15; > > > > > > > > > > should do the job, in less code space and without a single branch. ... > > > > > Another way of optomizing this could be: > > > > > > > > > > mask = (FM & 0x0f) | ((FM << 12) & 0x000f0000); > > > > > mask = (mask & 0x00030003) | ((mask << 6) & 0x03030303); > > > > > mask = (mask & 0x01010101) | ((mask << 3) & 0x10101010); > > > > > mask *= 15; ... > Ok, if you have measured that method1 is faster than method2, let us go for it. > I believe method2 would be faster if you had a large out-of-order execution > window, because more parallelism can be extracted from it, but this is probably > only true for high end cores, which do not need FPU emulation in the first place. FWIW the second has a long dependency chain on 'mask', whereas the first can execute the shift/and in any order and then merge the results. So on most superscalar cpu, or one with result delays for arithmetic, the first is likely to be faster. David
On Mon, Feb 10, 2014 at 11:17:38AM +0000, David Laight wrote: > > > However, your other solutions are better. > > > > > > > > > > > > > > > > > > mask = (FM & 1); > > > > > > mask |= (FM << 3) & 0x10; > > > > > > mask |= (FM << 6) & 0x100; > > > > > > mask |= (FM << 9) & 0x1000; > > > > > > mask |= (FM << 12) & 0x10000; > > > > > > mask |= (FM << 15) & 0x100000; > > > > > > mask |= (FM << 18) & 0x1000000; > > > > > > mask |= (FM << 21) & 0x10000000; > > > > > > mask *= 15; > > > > > > > > > > > > should do the job, in less code space and without a single branch. > ... > > > > > > Another way of optomizing this could be: > > > > > > > > > > > > mask = (FM & 0x0f) | ((FM << 12) & 0x000f0000); > > > > > > mask = (mask & 0x00030003) | ((mask << 6) & 0x03030303); > > > > > > mask = (mask & 0x01010101) | ((mask << 3) & 0x10101010); > > > > > > mask *= 15; > ... > > Ok, if you have measured that method1 is faster than method2, let us go for it. > > I believe method2 would be faster if you had a large out-of-order execution > > window, because more parallelism can be extracted from it, but this is probably > > only true for high end cores, which do not need FPU emulation in the first place. > > FWIW the second has a long dependency chain on 'mask', whereas the first can execute > the shift/and in any order and then merge the results. > So on most superscalar cpu, or one with result delays for arithmetic, the first > is likely to be faster. I disagree, perhaps mostly because the compiler is not clever enough, but right now the code for solution 1 is (actually I have rewritten the code and it reads: mask = (FM & 1) | ((FM << 3) & 0x10) | ((FM << 6) & 0x100) | ((FM << 9) & 0x1000) | ((FM << 12) & 0x10000) | ((FM << 15) & 0x100000) | ((FM << 18) & 0x1000000) | ((FM << 21) & 0x10000000); to avoid sequence point in case it hampers the compiler) and the output is: rlwinm 10,3,3,27,27 # D.11621, FM,, rlwinm 9,3,6,23,23 # D.11621, FM,, or 9,10,9 #, D.11621, D.11621, D.11621 rlwinm 10,3,0,31,31 # D.11621, FM, or 9,9,10 #, D.11621, D.11621, D.11621 rlwinm 10,3,9,19,19 # D.11621, FM,, or 9,9,10 #, D.11621, D.11621, D.11621 rlwinm 10,3,12,15,15 # D.11621, FM,, or 9,9,10 #, D.11621, D.11621, D.11621 rlwinm 10,3,15,11,11 # D.11621, FM,, or 9,9,10 #, D.11621, D.11621, D.11621 rlwinm 10,3,18,7,7 # D.11621, FM,, or 9,9,10 #, D.11621, D.11621, D.11621 rlwinm 3,3,21,3,3 # D.11621, FM,, or 9,9,3 #, mask, D.11621, D.11621 mulli 9,9,15 # mask, mask, see that r9 is used 7 times as both input and output operand, plus once for rlwinm. This gives a dependency length of 8 at least. In the other case (I've deleted the code) the dependency length was significantly shorter. In any case that one is fewer instructions, which is good for occasional use. Gabriel
> I disagree, perhaps mostly because the compiler is not clever enough, but right > now the code for solution 1 is (actually I have rewritten the code > and it reads: > > mask = (FM & 1) > | ((FM << 3) & 0x10) > | ((FM << 6) & 0x100) > | ((FM << 9) & 0x1000) > | ((FM << 12) & 0x10000) > | ((FM << 15) & 0x100000) > | ((FM << 18) & 0x1000000) > | ((FM << 21) & 0x10000000); > to avoid sequence point in case it hampers the compiler) > > and the output is: > > rlwinm 10,3,3,27,27 # D.11621, FM,, > rlwinm 9,3,6,23,23 # D.11621, FM,, > or 9,10,9 #, D.11621, D.11621, D.11621 > rlwinm 10,3,0,31,31 # D.11621, FM, > or 9,9,10 #, D.11621, D.11621, D.11621 > rlwinm 10,3,9,19,19 # D.11621, FM,, > or 9,9,10 #, D.11621, D.11621, D.11621 > rlwinm 10,3,12,15,15 # D.11621, FM,, > or 9,9,10 #, D.11621, D.11621, D.11621 > rlwinm 10,3,15,11,11 # D.11621, FM,, > or 9,9,10 #, D.11621, D.11621, D.11621 > rlwinm 10,3,18,7,7 # D.11621, FM,, > or 9,9,10 #, D.11621, D.11621, D.11621 > rlwinm 3,3,21,3,3 # D.11621, FM,, > or 9,9,3 #, mask, D.11621, D.11621 > mulli 9,9,15 # mask, mask, > > see that r9 is used 7 times as both input and output operand, plus > once for rlwinm. This gives a dependency length of 8 at least. > > In the other case (I've deleted the code) the dependency length > was significantly shorter. In any case that one is fewer instructions, > which is good for occasional use. Hmmm... I hand-counted a dependency length of 8 for the other version. Maybe there are some ppc instructions that reduce it. Stupid compiler :-) Trouble is, I bet that even if you code it as: mask1 = (FM & 1) | ((FM << 3) & 0x10); mask2 = ((FM << 6) & 0x100) | ((FM << 9) & 0x1000); mask3 = ((FM << 12) & 0x10000) | ((FM << 15) & 0x100000); mask4 = ((FM << 18) & 0x1000000) | ((FM << 21) & 0x10000000); mask1 |= mask2; mask3 |= mask4; mask = mask1 | mask3; the compiler will 'optimise' it to the above before code generation. If it doesn't adding () to pair the | might be enough. Then a new version of the compiler will change the behaviour again. David
On Mon, Feb 10, 2014 at 12:32:18PM +0000, David Laight wrote: > > I disagree, perhaps mostly because the compiler is not clever enough, but right > > now the code for solution 1 is (actually I have rewritten the code > > and it reads: > > > > mask = (FM & 1) > > | ((FM << 3) & 0x10) > > | ((FM << 6) & 0x100) > > | ((FM << 9) & 0x1000) > > | ((FM << 12) & 0x10000) > > | ((FM << 15) & 0x100000) > > | ((FM << 18) & 0x1000000) > > | ((FM << 21) & 0x10000000); > > to avoid sequence point in case it hampers the compiler) > > > > and the output is: > > > > rlwinm 10,3,3,27,27 # D.11621, FM,, > > rlwinm 9,3,6,23,23 # D.11621, FM,, > > or 9,10,9 #, D.11621, D.11621, D.11621 > > rlwinm 10,3,0,31,31 # D.11621, FM, > > or 9,9,10 #, D.11621, D.11621, D.11621 > > rlwinm 10,3,9,19,19 # D.11621, FM,, > > or 9,9,10 #, D.11621, D.11621, D.11621 > > rlwinm 10,3,12,15,15 # D.11621, FM,, > > or 9,9,10 #, D.11621, D.11621, D.11621 > > rlwinm 10,3,15,11,11 # D.11621, FM,, > > or 9,9,10 #, D.11621, D.11621, D.11621 > > rlwinm 10,3,18,7,7 # D.11621, FM,, > > or 9,9,10 #, D.11621, D.11621, D.11621 > > rlwinm 3,3,21,3,3 # D.11621, FM,, > > or 9,9,3 #, mask, D.11621, D.11621 > > mulli 9,9,15 # mask, mask, > > > > see that r9 is used 7 times as both input and output operand, plus > > once for rlwinm. This gives a dependency length of 8 at least. > > > > In the other case (I've deleted the code) the dependency length > > was significantly shorter. In any case that one is fewer instructions, > > which is good for occasional use. > > Hmmm... I hand-counted a dependency length of 8 for the other version. > Maybe there are some ppc instructions that reduce it. Either I misread the generated code or I got somewhat less. What helps for method1 is the rotate and mask instructions of PPC. Each of left shift and mask becomes a single rlwinm. > > Stupid compiler :-) Indeed. I've trying to coerce it into generating rlwimi instructions (in which case the whole building of the mask reduces to 8 assembly instructions) and failed. It seems that the compiler lacks some patterns some patterns that would directly map to rlwimi. > Trouble is, I bet that even if you code it as: > mask1 = (FM & 1) | ((FM << 3) & 0x10); > mask2 = ((FM << 6) & 0x100) | ((FM << 9) & 0x1000); > mask3 = ((FM << 12) & 0x10000) | ((FM << 15) & 0x100000); > mask4 = ((FM << 18) & 0x1000000) | ((FM << 21) & 0x10000000); > mask1 |= mask2; > mask3 |= mask4; > mask = mask1 | mask3; > the compiler will 'optimise' it to the above before code generation. Indeed it's what it does :-( I believe that the current suggestion is good enough. Gabriel
On Mon, 10 Feb 2014, Gabriel Paubert wrote: > On Fri, Feb 07, 2014 at 02:49:40PM -0600, James Yang wrote: > > On Fri, 7 Feb 2014, Gabriel Paubert wrote: > > > > > Hi Stephen, > > > > > > On Fri, Feb 07, 2014 at 11:27:57AM +1000, Stephen N Chivers wrote: > > > > Gabriel Paubert <paubert@iram.es> wrote on 02/06/2014 07:26:37 PM: > > > > > > > > > From: Gabriel Paubert <paubert@iram.es> > > > > > To: Stephen N Chivers <schivers@csc.com.au> > > > > > Cc: linuxppc-dev@lists.ozlabs.org, Chris Proctor <cproctor@csc.com.au> > > > > > Date: 02/06/2014 07:26 PM > > > > > Subject: Re: arch/powerpc/math-emu/mtfsf.c - incorrect mask? > > > > > > > > > > On Thu, Feb 06, 2014 at 12:09:00PM +1000, Stephen N Chivers wrote: > > > > > > > > > > > > > > mask = 0; > > > > > > if (FM & (1 << 0)) > > > > > > mask |= 0x0000000f; > > > > > > if (FM & (1 << 1)) > > > > > > mask |= 0x000000f0; > > > > > > if (FM & (1 << 2)) > > > > > > mask |= 0x00000f00; > > > > > > if (FM & (1 << 3)) > > > > > > mask |= 0x0000f000; > > > > > > if (FM & (1 << 4)) > > > > > > mask |= 0x000f0000; > > > > > > if (FM & (1 << 5)) > > > > > > mask |= 0x00f00000; > > > > > > if (FM & (1 << 6)) > > > > > > mask |= 0x0f000000; > > > > > > if (FM & (1 << 7)) > > > > > > mask |= 0x90000000; > > > > > > > > > > > > With the above mask computation I get consistent results for > > > > > > both the MPC8548 and MPC7410 boards. > Ok, if you have measured that method1 is faster than method2, let us go for it. > I believe method2 would be faster if you had a large out-of-order execution > window, because more parallelism can be extracted from it, but this is probably > only true for high end cores, which do not need FPU emulation in the first place. Yeah, 8548 can issue 2 SFX instructions per cycle which is what the compiler generated. > The other place where we can optimize is the generation of FEX. Here is > my current patch: > > > diff --git a/arch/powerpc/math-emu/mtfsf.c b/arch/powerpc/math-emu/mtfsf.c > index dbce92e..b57b3fa8 100644 > --- a/arch/powerpc/math-emu/mtfsf.c > +++ b/arch/powerpc/math-emu/mtfsf.c > @@ -11,48 +11,35 @@ mtfsf(unsigned int FM, u32 *frB) > u32 mask; > u32 fpscr; > > - if (FM == 0) > + if (likely(FM == 0xff)) > + mask = 0xffffffff; > + else if (unlikely(FM == 0)) > return 0; > - > - if (FM == 0xff) > - mask = 0x9fffffff; > else { > - mask = 0; > - if (FM & (1 << 0)) > - mask |= 0x90000000; > - if (FM & (1 << 1)) > - mask |= 0x0f000000; > - if (FM & (1 << 2)) > - mask |= 0x00f00000; > - if (FM & (1 << 3)) > - mask |= 0x000f0000; > - if (FM & (1 << 4)) > - mask |= 0x0000f000; > - if (FM & (1 << 5)) > - mask |= 0x00000f00; > - if (FM & (1 << 6)) > - mask |= 0x000000f0; > - if (FM & (1 << 7)) > - mask |= 0x0000000f; > + mask = (FM & 1); > + mask |= (FM << 3) & 0x10; > + mask |= (FM << 6) & 0x100; > + mask |= (FM << 9) & 0x1000; > + mask |= (FM << 12) & 0x10000; > + mask |= (FM << 15) & 0x100000; > + mask |= (FM << 18) & 0x1000000; > + mask |= (FM << 21) & 0x10000000; > + mask *= 15; Needs to also mask out bits 1 and 2, they aren't to be set from frB. mask &= 0x9FFFFFFF; > } > > - __FPU_FPSCR &= ~(mask); > - __FPU_FPSCR |= (frB[1] & mask); > + fpscr = ((__FPU_FPSCR & ~mask) | (frB[1] & mask)) & > + ~(FPSCR_VX | FPSCR_FEX); > > - __FPU_FPSCR &= ~(FPSCR_VX); > - if (__FPU_FPSCR & (FPSCR_VXSNAN | FPSCR_VXISI | FPSCR_VXIDI | > + if (fpscr & (FPSCR_VXSNAN | FPSCR_VXISI | FPSCR_VXIDI | > FPSCR_VXZDZ | FPSCR_VXIMZ | FPSCR_VXVC | > FPSCR_VXSOFT | FPSCR_VXSQRT | FPSCR_VXCVI)) > - __FPU_FPSCR |= FPSCR_VX; > - > - fpscr = __FPU_FPSCR; > - fpscr &= ~(FPSCR_FEX); > - if (((fpscr & FPSCR_VX) && (fpscr & FPSCR_VE)) || > - ((fpscr & FPSCR_OX) && (fpscr & FPSCR_OE)) || > - ((fpscr & FPSCR_UX) && (fpscr & FPSCR_UE)) || > - ((fpscr & FPSCR_ZX) && (fpscr & FPSCR_ZE)) || > - ((fpscr & FPSCR_XX) && (fpscr & FPSCR_XE))) > - fpscr |= FPSCR_FEX; > + fpscr |= FPSCR_VX; > + > + /* The bit order of exception enables and exception status > + * is the same. Simply shift and mask to check for enabled > + * exceptions. > + */ > + if (fpscr & (fpscr >> 22) & 0xf8) fpscr |= FPSCR_FEX; > __FPU_FPSCR = fpscr; > > #ifdef DEBUG > mtfsf.c | 57 ++++++++++++++++++++++----------------------------------- > 1 file changed, 22 insertions(+), 35 deletions(-) > > > Notes: > > 1) I'm really unsure on whether 0xff is frequent or not. So the likely() > statement at the beginning may be wrong. Actually, if it is not very likely, > it might be better to remove the special casef for FM==0xff. A look at > GCC sources shows that it never generates a mask of 0xff. From glibc > sources, there vast majority of cases uses 0x1, only isnan() uses 0xff. Can't handle all cases here. > 2) it may be better to remove the check for FM==0, after all, the instruction > efectively becomes a nop, and generating the instruction in the first place > would be too stupid for words. Hmm a heavy no-op. I wonder if it is heavier than a sync. > 3) if might be better to #define the magic constants (22 and 0xf8) used > in the last if statement. > > Gabriel
Hi James, On Mon, Feb 10, 2014 at 11:03:07AM -0600, James Yang wrote: [snipped] > > Ok, if you have measured that method1 is faster than method2, let us go for it. > > I believe method2 would be faster if you had a large out-of-order execution > > window, because more parallelism can be extracted from it, but this is probably > > only true for high end cores, which do not need FPU emulation in the first place. > > Yeah, 8548 can issue 2 SFX instructions per cycle which is what the > compiler generated. > Then it is method1. > > > The other place where we can optimize is the generation of FEX. Here is > > my current patch: > > > > > > diff --git a/arch/powerpc/math-emu/mtfsf.c b/arch/powerpc/math-emu/mtfsf.c > > index dbce92e..b57b3fa8 100644 > > --- a/arch/powerpc/math-emu/mtfsf.c > > +++ b/arch/powerpc/math-emu/mtfsf.c > > @@ -11,48 +11,35 @@ mtfsf(unsigned int FM, u32 *frB) > > u32 mask; > > u32 fpscr; > > > > - if (FM == 0) > > + if (likely(FM == 0xff)) > > + mask = 0xffffffff; > > + else if (unlikely(FM == 0)) > > return 0; > > - > > - if (FM == 0xff) > > - mask = 0x9fffffff; > > else { > > - mask = 0; > > - if (FM & (1 << 0)) > > - mask |= 0x90000000; > > - if (FM & (1 << 1)) > > - mask |= 0x0f000000; > > - if (FM & (1 << 2)) > > - mask |= 0x00f00000; > > - if (FM & (1 << 3)) > > - mask |= 0x000f0000; > > - if (FM & (1 << 4)) > > - mask |= 0x0000f000; > > - if (FM & (1 << 5)) > > - mask |= 0x00000f00; > > - if (FM & (1 << 6)) > > - mask |= 0x000000f0; > > - if (FM & (1 << 7)) > > - mask |= 0x0000000f; > > + mask = (FM & 1); > > + mask |= (FM << 3) & 0x10; > > + mask |= (FM << 6) & 0x100; > > + mask |= (FM << 9) & 0x1000; > > + mask |= (FM << 12) & 0x10000; > > + mask |= (FM << 15) & 0x100000; > > + mask |= (FM << 18) & 0x1000000; > > + mask |= (FM << 21) & 0x10000000; > > + mask *= 15; > > > Needs to also mask out bits 1 and 2, they aren't to be set from frB. > > mask &= 0x9FFFFFFF; > > Look at the following lines: > > > > } > > > > - __FPU_FPSCR &= ~(mask); > > - __FPU_FPSCR |= (frB[1] & mask); > > + fpscr = ((__FPU_FPSCR & ~mask) | (frB[1] & mask)) & > > + ~(FPSCR_VX | FPSCR_FEX); It's here (masking FPSCR_VX and FPSCR_FEX). Actually the previous code was redundant, it cleared FEX and VX in the mask computation and later again when recomputing them. Clearing them once should be enough. > > > > - __FPU_FPSCR &= ~(FPSCR_VX); > > - if (__FPU_FPSCR & (FPSCR_VXSNAN | FPSCR_VXISI | FPSCR_VXIDI | > > + if (fpscr & (FPSCR_VXSNAN | FPSCR_VXISI | FPSCR_VXIDI | > > FPSCR_VXZDZ | FPSCR_VXIMZ | FPSCR_VXVC | > > FPSCR_VXSOFT | FPSCR_VXSQRT | FPSCR_VXCVI)) > > - __FPU_FPSCR |= FPSCR_VX; > > - > > - fpscr = __FPU_FPSCR; > > - fpscr &= ~(FPSCR_FEX); > > - if (((fpscr & FPSCR_VX) && (fpscr & FPSCR_VE)) || > > - ((fpscr & FPSCR_OX) && (fpscr & FPSCR_OE)) || > > - ((fpscr & FPSCR_UX) && (fpscr & FPSCR_UE)) || > > - ((fpscr & FPSCR_ZX) && (fpscr & FPSCR_ZE)) || > > - ((fpscr & FPSCR_XX) && (fpscr & FPSCR_XE))) > > - fpscr |= FPSCR_FEX; > > + fpscr |= FPSCR_VX; > > + > > + /* The bit order of exception enables and exception status > > + * is the same. Simply shift and mask to check for enabled > > + * exceptions. > > + */ > > + if (fpscr & (fpscr >> 22) & 0xf8) fpscr |= FPSCR_FEX; > > __FPU_FPSCR = fpscr; > > > > #ifdef DEBUG > > mtfsf.c | 57 ++++++++++++++++++++++----------------------------------- > > 1 file changed, 22 insertions(+), 35 deletions(-) > > > > > > Notes: > > > > 1) I'm really unsure on whether 0xff is frequent or not. So the likely() > > statement at the beginning may be wrong. Actually, if it is not very likely, > > it might be better to remove the special casef for FM==0xff. A look at > > GCC sources shows that it never generates a mask of 0xff. From glibc > > sources, there vast majority of cases uses 0x1, only isnan() uses 0xff. > > Can't handle all cases here. That's why I would go for the simplest possible code. Conditionals are expensive and minimizing cache footprint is often the best measure of performance for infrequently used code. With this in mind, I would get rid of all the tests for special FM values and rely on the optimized generic case. > > > 2) it may be better to remove the check for FM==0, after all, the instruction > > effectively becomes a nop, and generating the instruction in the first place > > would be too stupid for words. > > Hmm a heavy no-op. I wonder if it is heavier than a sync. In theory not. It contains the equivalent of several isync (taking an exception and returning from it), but not any synchronization wrt the memory accesses. Gabriel
I have been trial booting a 3.14-rc2 kernel for a 85xx platform (dtbImage). After mounting the root filesystem there are no messages from the init scripts and the serial console is not available for login. In the kernel log messages there is: of_serial f1004500.serial: Unknown serial port found, ignored. The serial nodes in boards dts file are specified as: serial0: serial@4500 { cell-index = <0>; device_type = "serial"; compatible = "fsl,ns16550", "ns16550"; reg = <0x4500 0x100>; clock-frequency = <0>; interrupts = <0x2a 0x2>; interrupt-parent = <&mpic>; }; Reversing the order of the compatible: compatible = "ns16550", "fsl,ns16550"; restores the serial console. Linux-3.13 does not have this behaviour. There are 49 dts files in Linux-3.14-rc2 that have the fsl,ns16550 compatible first. Stephen Chivers, CSC Australia Pty. Ltd.
On Feb 11, 2014, at 2:57 PM, Stephen N Chivers <schivers@csc.com.au> wrote: > I have been trial booting a 3.14-rc2 kernel for a 85xx platform > (dtbImage). > > After mounting the root filesystem there are no messages from the init > scripts > and the serial console is not available for login. > > In the kernel log messages there is: > > of_serial f1004500.serial: Unknown serial port found, ignored. > > The serial nodes in boards dts file are specified as: > > serial0: serial@4500 { > cell-index = <0>; > device_type = "serial"; > compatible = "fsl,ns16550", "ns16550"; > reg = <0x4500 0x100>; > clock-frequency = <0>; > interrupts = <0x2a 0x2>; > interrupt-parent = <&mpic>; > }; > > Reversing the order of the compatible: > > compatible = "ns16550", "fsl,ns16550"; > > restores the serial console. > > Linux-3.13 does not have this behaviour. > > There are 49 dts files in Linux-3.14-rc2 that have the fsl,ns16550 > compatible first. Hmm, Wondering if this caused the issue: commit 105353145eafb3ea919f5cdeb652a9d8f270228e Author: Sebastian Hesselbarth <sebastian.hesselbarth@gmail.com> Date: Tue Dec 3 14:52:00 2013 +0100 OF: base: match each node compatible against all given matches first - k
On 02/11/2014 11:33 PM, Kumar Gala wrote: > > On Feb 11, 2014, at 2:57 PM, Stephen N Chivers <schivers@csc.com.au> wrote: > >> I have been trial booting a 3.14-rc2 kernel for a 85xx platform >> (dtbImage). >> >> After mounting the root filesystem there are no messages from the init >> scripts >> and the serial console is not available for login. >> >> In the kernel log messages there is: >> >> of_serial f1004500.serial: Unknown serial port found, ignored. >> >> The serial nodes in boards dts file are specified as: >> >> serial0: serial@4500 { >> cell-index = <0>; >> device_type = "serial"; >> compatible = "fsl,ns16550", "ns16550"; >> reg = <0x4500 0x100>; >> clock-frequency = <0>; >> interrupts = <0x2a 0x2>; >> interrupt-parent = <&mpic>; >> }; >> >> Reversing the order of the compatible: >> >> compatible = "ns16550", "fsl,ns16550"; >> >> restores the serial console. >> >> Linux-3.13 does not have this behaviour. >> >> There are 49 dts files in Linux-3.14-rc2 that have the fsl,ns16550 >> compatible first. > > Hmm, > > Wondering if this caused the issue: > > commit 105353145eafb3ea919f5cdeb652a9d8f270228e > Author: Sebastian Hesselbarth <sebastian.hesselbarth@gmail.com> > Date: Tue Dec 3 14:52:00 2013 +0100 > > OF: base: match each node compatible against all given matches first [adding Arnd on Cc] Could be. I checked tty/serial/of_serial.c and it does not provide a compatible for "fsl,ns16550". Does reverting the patch fix the issue observed? I don't think the missing compatible is causing it, but of_serial provides a DT match for .type = "serial" just to fail later on with the error seen above. The commit in question reorders of_match_device in a way that match table order is not relevant anymore. This can cause it to match .type = "serial" first here. Rather than touching the commit, I suggest to remove the problematic .type = "serial" from the match table. It is of no use anyway. Sebastian
Sebastian Hesselbarth <sebastian.hesselbarth@gmail.com> wrote on 02/12/2014 09:51:43 AM: > From: Sebastian Hesselbarth <sebastian.hesselbarth@gmail.com> > To: Kumar Gala <galak@kernel.crashing.org>, Stephen N Chivers > <schivers@csc.com.au> > Cc: linuxppc-dev@lists.ozlabs.org, Chris Proctor > <cproctor@csc.com.au>, devicetree <devicetree@vger.kernel.org>, Arnd > Bergmann <arnd@arndb.de> > Date: 02/12/2014 09:51 AM > Subject: Re: Linux-3.14-rc2: Order of serial node compatibles in DTS files. > > On 02/11/2014 11:33 PM, Kumar Gala wrote: > > > > On Feb 11, 2014, at 2:57 PM, Stephen N Chivers <schivers@csc.com.au> wrote: > > > >> I have been trial booting a 3.14-rc2 kernel for a 85xx platform > >> (dtbImage). > >> > >> After mounting the root filesystem there are no messages from the init > >> scripts > >> and the serial console is not available for login. > >> > >> In the kernel log messages there is: > >> > >> of_serial f1004500.serial: Unknown serial port found, ignored. > >> > >> The serial nodes in boards dts file are specified as: > >> > >> serial0: serial@4500 { > >> cell-index = <0>; > >> device_type = "serial"; > >> compatible = "fsl,ns16550", "ns16550"; > >> reg = <0x4500 0x100>; > >> clock-frequency = <0>; > >> interrupts = <0x2a 0x2>; > >> interrupt-parent = <&mpic>; > >> }; > >> > >> Reversing the order of the compatible: > >> > >> compatible = "ns16550", "fsl,ns16550"; > >> > >> restores the serial console. > >> > >> Linux-3.13 does not have this behaviour. > >> > >> There are 49 dts files in Linux-3.14-rc2 that have the fsl,ns16550 > >> compatible first. > > > > Hmm, > > > > Wondering if this caused the issue: > > > > commit 105353145eafb3ea919f5cdeb652a9d8f270228e > > Author: Sebastian Hesselbarth <sebastian.hesselbarth@gmail.com> > > Date: Tue Dec 3 14:52:00 2013 +0100 > > > > OF: base: match each node compatible against all given matches first > > [adding Arnd on Cc] > > Could be. I checked tty/serial/of_serial.c and it does not provide a > compatible for "fsl,ns16550". Does reverting the patch fix the issue > observed? > > I don't think the missing compatible is causing it, but of_serial > provides a DT match for .type = "serial" just to fail later on > with the error seen above. > > The commit in question reorders of_match_device in a way that match > table order is not relevant anymore. This can cause it to match > .type = "serial" first here. > > Rather than touching the commit, I suggest to remove the problematic > .type = "serial" from the match table. It is of no use anyway. Deleting the "serial" line from the match table fixes the problem. I tested it for both orderings of compatible. > > Sebastian Thanks, Stephen Chivers, CSC Australia Pty. Ltd.
On Tue, 2014-02-11 at 23:51 +0100, Sebastian Hesselbarth wrote: > On 02/11/2014 11:33 PM, Kumar Gala wrote: > > Hmm, > > > > Wondering if this caused the issue: > > > > commit 105353145eafb3ea919f5cdeb652a9d8f270228e > > Author: Sebastian Hesselbarth <sebastian.hesselbarth@gmail.com> > > Date: Tue Dec 3 14:52:00 2013 +0100 > > > > OF: base: match each node compatible against all given matches first > > [adding Arnd on Cc] > > Could be. I checked tty/serial/of_serial.c and it does not provide a > compatible for "fsl,ns16550". Does reverting the patch fix the issue > observed? > > I don't think the missing compatible is causing it, but of_serial > provides a DT match for .type = "serial" just to fail later on > with the error seen above. > > The commit in question reorders of_match_device in a way that match > table order is not relevant anymore. This can cause it to match > .type = "serial" first here. > > Rather than touching the commit, I suggest to remove the problematic > .type = "serial" from the match table. It is of no use anyway. Regardless of whether .type = "serial" gets removed, it seems wrong for of_match_node() to accept a .type-only match (or .name, or anything else that doesn't involve .compatible) before it accepts a compatible match other than the first in the compatible property. -Scott
On 02/12/2014 12:41 AM, Scott Wood wrote: > On Tue, 2014-02-11 at 23:51 +0100, Sebastian Hesselbarth wrote: >> On 02/11/2014 11:33 PM, Kumar Gala wrote: >>> Hmm, >>> >>> Wondering if this caused the issue: >>> >>> commit 105353145eafb3ea919f5cdeb652a9d8f270228e >>> Author: Sebastian Hesselbarth <sebastian.hesselbarth@gmail.com> >>> Date: Tue Dec 3 14:52:00 2013 +0100 >>> >>> OF: base: match each node compatible against all given matches first >> >> [adding Arnd on Cc] >> >> Could be. I checked tty/serial/of_serial.c and it does not provide a >> compatible for "fsl,ns16550". Does reverting the patch fix the issue >> observed? >> >> I don't think the missing compatible is causing it, but of_serial >> provides a DT match for .type = "serial" just to fail later on >> with the error seen above. >> >> The commit in question reorders of_match_device in a way that match >> table order is not relevant anymore. This can cause it to match >> .type = "serial" first here. >> >> Rather than touching the commit, I suggest to remove the problematic >> .type = "serial" from the match table. It is of no use anyway. > > Regardless of whether .type = "serial" gets removed, it seems wrong for > of_match_node() to accept a .type-only match (or .name, or anything else > that doesn't involve .compatible) before it accepts a compatible match > other than the first in the compatible property. Right, I thought about it and came to the same conclusion. I sent a patch a second ago to prefer .compatible != NULL matches over those with .compatible == NULL. Would be great if Stephen can re-test that. If it solves the issue, I can send a patch tomorrow. Sebastian
Sebastian Hesselbarth <sebastian.hesselbarth@gmail.com> wrote on 02/12/2014 10:46:36 AM: > From: Sebastian Hesselbarth <sebastian.hesselbarth@gmail.com> > To: Scott Wood <scottwood@freescale.com> > Cc: Kumar Gala <galak@kernel.crashing.org>, Stephen N Chivers > <schivers@csc.com.au>, Chris Proctor <cproctor@csc.com.au>, > linuxppc-dev@lists.ozlabs.org, Arnd Bergmann <arnd@arndb.de>, > devicetree <devicetree@vger.kernel.org> > Date: 02/12/2014 11:04 AM > Subject: Re: Linux-3.14-rc2: Order of serial node compatibles in DTS files. > > On 02/12/2014 12:41 AM, Scott Wood wrote: > > On Tue, 2014-02-11 at 23:51 +0100, Sebastian Hesselbarth wrote: > >> On 02/11/2014 11:33 PM, Kumar Gala wrote: > >>> Hmm, > >>> > >>> Wondering if this caused the issue: > >>> > >>> commit 105353145eafb3ea919f5cdeb652a9d8f270228e > >>> Author: Sebastian Hesselbarth <sebastian.hesselbarth@gmail.com> > >>> Date: Tue Dec 3 14:52:00 2013 +0100 > >>> > >>> OF: base: match each node compatible against all given matches first > >> > >> [adding Arnd on Cc] > >> > >> Could be. I checked tty/serial/of_serial.c and it does not provide a > >> compatible for "fsl,ns16550". Does reverting the patch fix the issue > >> observed? > >> > >> I don't think the missing compatible is causing it, but of_serial > >> provides a DT match for .type = "serial" just to fail later on > >> with the error seen above. > >> > >> The commit in question reorders of_match_device in a way that match > >> table order is not relevant anymore. This can cause it to match > >> .type = "serial" first here. > >> > >> Rather than touching the commit, I suggest to remove the problematic > >> .type = "serial" from the match table. It is of no use anyway. > > > > Regardless of whether .type = "serial" gets removed, it seems wrong for > > of_match_node() to accept a .type-only match (or .name, or anything else > > that doesn't involve .compatible) before it accepts a compatible match > > other than the first in the compatible property. > > Right, I thought about it and came to the same conclusion. I sent a > patch a second ago to prefer .compatible != NULL matches over those > with .compatible == NULL. > > Would be great if Stephen can re-test that. If it solves the issue, I > can send a patch tomorrow. Done. But, the Interrupt Controller (MPIC) goes AWOL and it is down hill from there. The MPIC is specified in the DTS as: mpic: pic@40000 { interrupt-controller; #address-cells = <0>; #interrupt-cells = <2>; reg = <0x40000 0x40000>; compatible = "chrp,open-pic"; device_type = "open-pic"; big-endian; }; The board support file has the standard mechanism for allocating the PIC: struct mpic *mpic; mpic = mpic_alloc(NULL, 0, 0, 0, 256, " OpenPIC "); BUG_ON(mpic == NULL); mpic_init(mpic); I checked for damage in applying the patch and it has applied correctly. Stephen Chivers, CSC Australia Pty. Ltd.
On 02/12/2014 01:21 AM, Stephen N Chivers wrote: > Sebastian Hesselbarth <sebastian.hesselbarth@gmail.com> wrote on > 02/12/2014 10:46:36 AM: > >> From: Sebastian Hesselbarth <sebastian.hesselbarth@gmail.com> >> To: Scott Wood <scottwood@freescale.com> >> Cc: Kumar Gala <galak@kernel.crashing.org>, Stephen N Chivers >> <schivers@csc.com.au>, Chris Proctor <cproctor@csc.com.au>, >> linuxppc-dev@lists.ozlabs.org, Arnd Bergmann <arnd@arndb.de>, >> devicetree <devicetree@vger.kernel.org> >> Date: 02/12/2014 11:04 AM >> Subject: Re: Linux-3.14-rc2: Order of serial node compatibles in DTS > files. >> >> On 02/12/2014 12:41 AM, Scott Wood wrote: >>> >>> Regardless of whether .type = "serial" gets removed, it seems wrong for >>> of_match_node() to accept a .type-only match (or .name, or anything else >>> that doesn't involve .compatible) before it accepts a compatible match >>> other than the first in the compatible property. >> >> Right, I thought about it and came to the same conclusion. I sent a >> patch a second ago to prefer .compatible != NULL matches over those >> with .compatible == NULL. >> >> Would be great if Stephen can re-test that. If it solves the issue, I >> can send a patch tomorrow. > Done. > > But, the Interrupt Controller (MPIC) > goes AWOL and it is down hill from there. > > The MPIC is specified in the DTS as: > > mpic: pic@40000 { > interrupt-controller; > #address-cells = <0>; > #interrupt-cells = <2>; > reg = <0x40000 0x40000>; > compatible = "chrp,open-pic"; > device_type = "open-pic"; > big-endian; > }; > > The board support file has the standard mechanism for allocating > the PIC: > > struct mpic *mpic; > > mpic = mpic_alloc(NULL, 0, 0, 0, 256, " OpenPIC "); > BUG_ON(mpic == NULL); > > mpic_init(mpic); > > I checked for damage in applying the patch and it has applied > correctly. Hmm, I did a mistake in the patch: - while (m->name[0] || m->type[0]) { + while (m->compatible[0] || m->name[0] || m->type[0]) { for the second added match. Otherwise, the matches are not evaluated down to the sentinel but the loop quits on the first match table entry without .name and .type set. Sebastian
On Wed, Feb 12, 2014 at 09:25:24AM +0100, Sebastian Hesselbarth wrote: > > Hmm, I did a mistake in the patch: > > - while (m->name[0] || m->type[0]) { > + while (m->compatible[0] || m->name[0] || m->type[0]) { > > for the second added match. Otherwise, the matches are not > evaluated down to the sentinel but the loop quits on the first > match table entry without .name and .type set. But this is still not right. We also need to skip the matches with .compatible here. Thanks, Kevin > > Sebastian > _______________________________________________ > Linuxppc-dev mailing list > Linuxppc-dev@lists.ozlabs.org > https://lists.ozlabs.org/listinfo/linuxppc-dev
diff --git a/arch/powerpc/math-emu/mtfsf.c b/arch/powerpc/math-emu/mtfsf.c index dbce92e..b57b3fa8 100644 --- a/arch/powerpc/math-emu/mtfsf.c +++ b/arch/powerpc/math-emu/mtfsf.c @@ -11,48 +11,35 @@ mtfsf(unsigned int FM, u32 *frB) u32 mask; u32 fpscr; - if (FM == 0) + if (likely(FM == 0xff)) + mask = 0xffffffff; + else if (unlikely(FM == 0)) return 0; - - if (FM == 0xff) - mask = 0x9fffffff; else { - mask = 0; - if (FM & (1 << 0)) - mask |= 0x90000000; - if (FM & (1 << 1)) - mask |= 0x0f000000; - if (FM & (1 << 2)) - mask |= 0x00f00000; - if (FM & (1 << 3)) - mask |= 0x000f0000; - if (FM & (1 << 4)) - mask |= 0x0000f000; - if (FM & (1 << 5)) - mask |= 0x00000f00; - if (FM & (1 << 6)) - mask |= 0x000000f0; - if (FM & (1 << 7)) - mask |= 0x0000000f; + mask = (FM & 1); + mask |= (FM << 3) & 0x10; + mask |= (FM << 6) & 0x100; + mask |= (FM << 9) & 0x1000; + mask |= (FM << 12) & 0x10000; + mask |= (FM << 15) & 0x100000; + mask |= (FM << 18) & 0x1000000; + mask |= (FM << 21) & 0x10000000; + mask *= 15; } - __FPU_FPSCR &= ~(mask); - __FPU_FPSCR |= (frB[1] & mask); + fpscr = ((__FPU_FPSCR & ~mask) | (frB[1] & mask)) & + ~(FPSCR_VX | FPSCR_FEX); - __FPU_FPSCR &= ~(FPSCR_VX); - if (__FPU_FPSCR & (FPSCR_VXSNAN | FPSCR_VXISI | FPSCR_VXIDI | + if (fpscr & (FPSCR_VXSNAN | FPSCR_VXISI | FPSCR_VXIDI | FPSCR_VXZDZ | FPSCR_VXIMZ | FPSCR_VXVC | FPSCR_VXSOFT | FPSCR_VXSQRT | FPSCR_VXCVI)) - __FPU_FPSCR |= FPSCR_VX; - - fpscr = __FPU_FPSCR; - fpscr &= ~(FPSCR_FEX); - if (((fpscr & FPSCR_VX) && (fpscr & FPSCR_VE)) || - ((fpscr & FPSCR_OX) && (fpscr & FPSCR_OE)) || - ((fpscr & FPSCR_UX) && (fpscr & FPSCR_UE)) || - ((fpscr & FPSCR_ZX) && (fpscr & FPSCR_ZE)) || - ((fpscr & FPSCR_XX) && (fpscr & FPSCR_XE))) - fpscr |= FPSCR_FEX; + fpscr |= FPSCR_VX; + + /* The bit order of exception enables and exception status + * is the same. Simply shift and mask to check for enabled + * exceptions. + */ + if (fpscr & (fpscr >> 22) & 0xf8) fpscr |= FPSCR_FEX; __FPU_FPSCR = fpscr; #ifdef DEBUG