powerpc: handle VSX alignment faults correctly in little-endian mode

Message ID 4B2646F9.4000203@linux.vnet.ibm.com (mailing list archive)
State Accepted, archived
Delegated to: Benjamin Herrenschmidt

Commit Message

Neil Campbell Dec. 14, 2009, 2:08 p.m. UTC
This patch fixes the handling of VSX alignment faults in little-endian
mode (the current code assumes the processor is in big-endian mode).

The patch also makes the handlers clear the top 8 bytes of the register
when handling an 8 byte VSX load.

This is based on 2.6.32.

Signed-off-by: Neil Campbell <neilc@linux.vnet.ibm.com>
Cc: <stable@kernel.org>
---

Comments

Neil Campbell Dec. 14, 2009, 2:21 p.m. UTC | #1
Neil Campbell wrote:
> This patch fixes the handling of VSX alignment faults in little-endian
> mode (the current code assumes the processor is in big-endian mode).
> 
> The patch also makes the handlers clear the top 8 bytes of the register
> when handling an 8 byte VSX load.

For anyone interested, here is a test case that demonstrates the problem.

It should compile with something like:

gcc -m64 -Wa,-mregnames -fno-strict-aliasing -mcpu=power7 -mvsx vsx_le.c -o vsx_le

On an unpatched kernel it reports 8 failures for me; the patch fixes all 8 of them.
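
A note for anyone decoding the inline asm in the macros: the "li r0, 171; li r3, 20; li r4, 1; sc" sequence is, as far as I can tell, a raw prctl(PR_SET_ENDIAN, PR_ENDIAN_LITTLE) call (prctl is syscall 171 on powerpc, and PR_SET_ENDIAN is option 20), and the pre-swapped .long words after the instruction under test make the same syscall with PR_ENDIAN_BIG to switch back. In C the pair would look like the sketch below, though the test cannot actually be written this way, because once the first call returns, instruction fetch is already little-endian:

#include <sys/prctl.h>	/* prctl(), PR_SET_ENDIAN, PR_ENDIAN_* */

/* Illustration only: C equivalents of the endian-switching syscalls. */
static void endian_round_trip(void)
{
	prctl(PR_SET_ENDIAN, PR_ENDIAN_LITTLE);	/* li r0,171; li r3,20; li r4,1; sc */
	/* Anything from here on executes with little-endian instruction
	 * fetch, which is why the test encodes this stretch (including
	 * the switch back) as pre-swapped .long values. */
	prctl(PR_SET_ENDIAN, PR_ENDIAN_BIG);	/* the LE-encoded .long ...38 words */
}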

---

#include <stdio.h>
#include <string.h>

int fails = 0;

#define LOAD_FUNC(name,inst) \
void test_load_##name(char* input, char* output, int le) \
{ \
  int aligned = (0 == ((long)input & 15)); \
  char* alignstr = aligned?"aligned:   ":"unaligned: "; \
  char* modestr = le?"(le)":"(be)"; \
  int i; \
  char dummydata[16] __attribute__((__aligned__(16))) = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; \
 \
  memset(output, 0, 16); \
 \
  asm ( \
      "mr r15, %[address1]\n\t" \
      "mr r16, %[address2]\n\t" \
      "lvx v0, r0, %[address3]\n\t" /* set register to dummy values */ \
      "cmpwi %[le],1 \n\t" \
      "beq "#name"leversion \n\t" \
      #name" vs32, r0, r15\n\t" \
      "b " #name"store\n\t" \
      #name"leversion: \n\t" \
      "li r0, 171\n\t" \
      "li r3, 20\n\t" \
      "li r4, 1\n\t" \
      "sc\n\t" \
      ".long " inst "\n\t" \
      ".long 0xab000038\n\t" /*"li 0, 171\n\t"*/ \
      ".long 0x14006038\n\t" /*"li 3, 20\n\t"*/ \
      ".long 0x00008038\n\t" /*"li 4, 0\n\t"*/ \
      ".long 0x02000044\n\t" /*"sc\n\t"*/ \
      #name"store: \n\t" \
      "stvx v0,r0,r16 \n\t" \
      : \
      : [address1] "b" (input), [address2] "b" (output), [address3] "b" (dummydata), [le] "b" (le) \
      : "vs32", "r0", "r3", "r4", "r9", "r15", "r16", "cc",  "memory"); \
 \
  fprintf(stderr, #name" %s after %s  ", alignstr, modestr); \
  for (i = 0; i < 16; ++i) \
  { \
    fprintf(stderr, " %x ", output[i]); \
  } \
  fprintf(stderr, "\n"); \
}


#define STORE_FUNC(name,inst) \
void test_store_##name(char* input, char* output, int le) \
{ \
  int aligned = (0 == ((long)output & 15)); \
  char* alignstr = aligned?"aligned:   ":"unaligned: "; \
  char* modestr = le?"(le)":"(be)"; \
  int i; \
 \
  memset(output, 0, 16); \
 \
  asm ( \
      "mr r15, %[address2]\n\t" \
      "lvx v0, r0, %[address1]\n\t" \
      "cmpwi %[le],1 \n\t" \
      "beq "#name"leversion \n\t" \
      #name" vs32, r0, r15\n\t" \
      "b " #name"end\n\t" \
      #name"leversion: \n\t" \
      "li r0, 171\n\t" \
      "li r3, 20\n\t" \
      "li r4, 1\n\t" \
      "sc\n\t" \
      ".long " inst "\n\t" \
      ".long 0xab000038\n\t" /*"li 0, 171\n\t"*/ \
      ".long 0x14006038\n\t" /*"li 3, 20\n\t"*/ \
      ".long 0x00008038\n\t" /*"li 4, 0\n\t"*/ \
      ".long 0x02000044\n\t" /*"sc\n\t"*/ \
      #name"end: \n\t" \
      : \
      : [address1] "b" (input), [address2] "b" (output), [le] "b" (le) \
      : "vs32", "r0", "r3", "r4", "r9", "r15", "cc",  "memory"); \
 \
  fprintf(stderr, #name" %s after %s  ", alignstr, modestr); \
  for (i = 0; i < 16; ++i) \
  { \
    fprintf(stderr, " %x ", output[i]); \
  } \
  fprintf(stderr, "\n"); \
}

void do_compare(char* buf1, char* buf2)
{
  if(0 == memcmp(buf1,buf2,16))
  {
    fprintf(stderr, "PASS\n");
  }
  else
  {
    fprintf(stderr, "FAIL\n");
    fails++;
  }
}

STORE_FUNC(stxvw4x, "0x197f007c")
STORE_FUNC(stxvd2x, "0x997f007c")
STORE_FUNC(stxsdx, "0x997d007c")

LOAD_FUNC(lxvw4x, "0x197e007c")
LOAD_FUNC(lxvd2x, "0x997e007c")
LOAD_FUNC(lxsdx, "0x997c007c")
LOAD_FUNC(lxvdsx, "0x997a007c")

int main(int argc, char* argv[])
{
  char inbuf[17] __attribute__((__aligned__(16))) = { -1, 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf };
  char alignedinbuf[16] __attribute__((__aligned__(16))) = { 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf };
  char outbuf[17] __attribute__((__aligned__(16))) = { -1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 };
  char alignedoutbuf[16] __attribute__((__aligned__(16))) = { 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 };
  char alignedoutbuf2[16] __attribute__((__aligned__(16))) = { 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 };

  test_store_stxvw4x(alignedinbuf, alignedoutbuf, 0);
  test_store_stxvw4x(alignedinbuf, &outbuf[1], 0);
  do_compare(alignedoutbuf, &outbuf[1]);
  test_store_stxvw4x(alignedinbuf, alignedoutbuf, 1);
  test_store_stxvw4x(alignedinbuf, &outbuf[1], 1);
  do_compare(alignedoutbuf, &outbuf[1]);
  fprintf(stderr, "\n");

  test_store_stxvd2x(alignedinbuf, alignedoutbuf, 0);
  test_store_stxvd2x(alignedinbuf, &outbuf[1], 0);
  do_compare(alignedoutbuf, &outbuf[1]);
  test_store_stxvd2x(alignedinbuf, alignedoutbuf, 1);
  test_store_stxvd2x(alignedinbuf, &outbuf[1], 1);
  do_compare(alignedoutbuf, &outbuf[1]);
  fprintf(stderr, "\n");

  test_store_stxsdx(alignedinbuf, alignedoutbuf, 0);
  test_store_stxsdx(alignedinbuf, &outbuf[1], 0);
  do_compare(alignedoutbuf, &outbuf[1]);
  test_store_stxsdx(alignedinbuf, alignedoutbuf, 1);
  test_store_stxsdx(alignedinbuf, &outbuf[1], 1);
  do_compare(alignedoutbuf, &outbuf[1]);
  fprintf(stderr, "\n");

  test_load_lxvw4x(alignedinbuf, alignedoutbuf, 0);
  test_load_lxvw4x(&inbuf[1], alignedoutbuf2, 0);
  do_compare(alignedoutbuf, alignedoutbuf2);
  test_load_lxvw4x(alignedinbuf, alignedoutbuf, 1);
  test_load_lxvw4x(&inbuf[1], alignedoutbuf2, 1);
  do_compare(alignedoutbuf, alignedoutbuf2);
  fprintf(stderr, "\n");

  test_load_lxvd2x(alignedinbuf, alignedoutbuf, 0);
  test_load_lxvd2x(&inbuf[1], alignedoutbuf2, 0);
  do_compare(alignedoutbuf, alignedoutbuf2);
  test_load_lxvd2x(alignedinbuf, alignedoutbuf, 1);
  test_load_lxvd2x(&inbuf[1], alignedoutbuf2, 1);
  do_compare(alignedoutbuf, alignedoutbuf2);
  fprintf(stderr, "\n");

  test_load_lxsdx(alignedinbuf, alignedoutbuf, 0);
  test_load_lxsdx(&inbuf[1], alignedoutbuf2, 0);
  do_compare(alignedoutbuf, alignedoutbuf2);
  test_load_lxsdx(alignedinbuf, alignedoutbuf, 1);
  test_load_lxsdx(&inbuf[1], alignedoutbuf2, 1);
  do_compare(alignedoutbuf, alignedoutbuf2);
  fprintf(stderr, "\n");

  test_load_lxvdsx(alignedinbuf, alignedoutbuf, 0);
  test_load_lxvdsx(&inbuf[1], alignedoutbuf2, 0);
  do_compare(alignedoutbuf, alignedoutbuf2);
  test_load_lxvdsx(alignedinbuf, alignedoutbuf, 1);
  test_load_lxvdsx(&inbuf[1], alignedoutbuf2, 1);
  do_compare(alignedoutbuf, alignedoutbuf2);
  fprintf(stderr, "\n");

  fprintf(stderr, "%d tests failed\n", fails);
  return fails;
}
Michael Neuling Dec. 14, 2009, 9:05 p.m. UTC | #2
> This patch fixes the handling of VSX alignment faults in little-endian
> mode (the current code assumes the processor is in big-endian mode).
> 
> The patch also makes the handlers clear the top 8 bytes of the register
> when handling an 8 byte VSX load.
> 
> This is based on 2.6.32.
> 
> Signed-off-by: Neil Campbell <neilc@linux.vnet.ibm.com>

Thanks for this Neil!

Acked-by: Michael Neuling <mikey@neuling.org>


Patch

diff --git a/arch/powerpc/kernel/align.c b/arch/powerpc/kernel/align.c
index a5b632e..f0c624f 100644
--- a/arch/powerpc/kernel/align.c
+++ b/arch/powerpc/kernel/align.c
@@ -642,10 +642,14 @@  static int emulate_spe(struct pt_regs *regs, unsigned int reg,
  */
 static int emulate_vsx(unsigned char __user *addr, unsigned int reg,
 		       unsigned int areg, struct pt_regs *regs,
-		       unsigned int flags, unsigned int length)
+		       unsigned int flags, unsigned int length,
+		       unsigned int elsize)
 {
 	char *ptr;
+	unsigned long *lptr;
 	int ret = 0;
+	int sw = 0;
+	int i, j;
 
 	flush_vsx_to_thread(current);
 
@@ -654,19 +658,35 @@  static int emulate_vsx(unsigned char __user *addr, unsigned int reg,
 	else
 		ptr = (char *) &current->thread.vr[reg - 32];
 
-	if (flags & ST)
-		ret = __copy_to_user(addr, ptr, length);
-        else {
-		if (flags & SPLT){
-			ret = __copy_from_user(ptr, addr, length);
-			ptr += length;
+	lptr = (unsigned long *) ptr;
+
+	if (flags & SW)
+		sw = elsize-1;
+
+	for (j = 0; j < length; j += elsize) {
+		for (i = 0; i < elsize; ++i) {
+			if (flags & ST)
+				ret |= __put_user(ptr[i^sw], addr + i);
+			else
+				ret |= __get_user(ptr[i^sw], addr + i);
 		}
-		ret |= __copy_from_user(ptr, addr, length);
+		ptr  += elsize;
+		addr += elsize;
 	}
-	if (flags & U)
-		regs->gpr[areg] = regs->dar;
-	if (ret)
+
+	if (!ret) {
+		if (flags & U)
+			regs->gpr[areg] = regs->dar;
+
+		/* Splat load copies the same data to top and bottom 8 bytes */
+		if (flags & SPLT)
+			lptr[1] = lptr[0];
+		/* For 8 byte loads, zero the top 8 bytes */
+		else if (!(flags & ST) && (8 == length))
+			lptr[1] = 0;
+	} else
 		return -EFAULT;
+
 	return 1;
 }
 #endif
@@ -767,16 +787,25 @@  int fix_alignment(struct pt_regs *regs)
 
 #ifdef CONFIG_VSX
 	if ((instruction & 0xfc00003e) == 0x7c000018) {
-		/* Additional register addressing bit (64 VSX vs 32 FPR/GPR */
+		unsigned int elsize;
+
+		/* Additional register addressing bit (64 VSX vs 32 FPR/GPR) */
 		reg |= (instruction & 0x1) << 5;
 		/* Simple inline decoder instead of a table */
+		/* VSX has only 8 and 16 byte memory accesses */
+		nb = 8;
 		if (instruction & 0x200)
 			nb = 16;
-		else if (instruction & 0x080)
-			nb = 8;
-		else
-			nb = 4;
+
+		/* Vector stores in little-endian mode swap individual
+		   elements, so process them separately */
+		elsize = 4;
+		if (instruction & 0x80)
+			elsize = 8;
+
 		flags = 0;
+		if (regs->msr & MSR_LE)
+			flags |= SW;
 		if (instruction & 0x100)
 			flags |= ST;
 		if (instruction & 0x040)
@@ -787,7 +816,7 @@  int fix_alignment(struct pt_regs *regs)
 			nb = 8;
 		}
 		PPC_WARN_EMULATED(vsx);
-		return emulate_vsx(addr, reg, areg, regs, flags, nb);
+		return emulate_vsx(addr, reg, areg, regs, flags, nb, elsize);
 	}
 #endif
 	/* A size of 0 indicates an instruction we don't support, with
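
A closing note on the new inner loop in emulate_vsx(): when the SW flag is set, sw = elsize - 1, and since elsize is a power of two, the index i ^ sw mirrors i within each element. The copy therefore reverses the bytes of every element while keeping the elements themselves in order, and for 8-byte loads the top half of the register is zeroed afterwards. Here is a minimal userspace sketch of that loop (my own illustration; plain assignments stand in for __put_user/__get_user):

#include <stdio.h>

/* Sketch of the patched inner loop, store direction: copy 'length'
 * bytes in 'elsize'-byte elements, reversing the bytes within each
 * element when 'swap' is set.  For loads the i ^ sw index simply
 * moves to the other side of the assignment, with the same net effect. */
static void copy_elements(unsigned char *dst, const unsigned char *src,
			  int length, int elsize, int swap)
{
	int sw = swap ? elsize - 1 : 0;
	int i, j;

	for (j = 0; j < length; j += elsize) {
		for (i = 0; i < elsize; ++i)
			dst[i] = src[i ^ sw];	/* i ^ sw mirrors i in the element */
		dst += elsize;
		src += elsize;
	}
}

int main(void)
{
	unsigned char in[16] = { 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
				 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf };
	unsigned char out[16];
	int i;

	copy_elements(out, in, 16, 4, 1);	/* 4-byte elements, swapped */
	for (i = 0; i < 16; ++i)
		printf("%x ", out[i]);
	printf("\n");	/* prints: 3 2 1 0 7 6 5 4 b a 9 8 f e d c */
	return 0;
}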