diff mbox series

rs6000: __builtin_mma_disassemble_acc() doesn't store elements correctly in LE mode

Message ID dc2f9be0-d4fb-22e3-58d3-01169a3d0e14@linux.ibm.com
State New
Headers show
Series rs6000: __builtin_mma_disassemble_acc() doesn't store elements correctly in LE mode | expand

Commit Message

Peter Bergner July 22, 2020, 5:01 p.m. UTC
PR96236 shows a problem where we don't correctly store our 512-bit accumulators
correctly in little-endian mode.  The patch below detects when we're doing a
little-endian memory access and stores to the correct memory locations.

This passed bootstrap and regtesting with no regressions.  Raji verified
the runnable test case changes work with a fixed compiler.

Ok for trunk and backport to the GCC 10 branch once it reopens?

Peter

gcc/
	PR target/96236
	* config/rs6000/rs6000-call.c (rs6000_gimple_fold_mma_builtin): Handle
	little-endian memory ordering.
    
gcc/testsuite/
	PR target/96236
	* gcc.target/powerpc/mma-double-test.c: Update storing results for
	correct little-endian ordering.
	* gcc.target/powerpc/mma-single-test.c: Likewise.

Comments

Segher Boessenkool July 22, 2020, 6 p.m. UTC | #1
Hi Peter,

On Wed, Jul 22, 2020 at 12:01:21PM -0500, Peter Bergner wrote:
> PR96236 shows a problem where we don't correctly store our 512-bit accumulators
> correctly in little-endian mode.  The patch below detects when we're doing a
> little-endian memory access and stores to the correct memory locations.

> gcc/
> 	PR target/96236
> 	* config/rs6000/rs6000-call.c (rs6000_gimple_fold_mma_builtin): Handle
> 	little-endian memory ordering.
>     
> gcc/testsuite/
> 	PR target/96236
> 	* gcc.target/powerpc/mma-double-test.c: Update storing results for
> 	correct little-endian ordering.
> 	* gcc.target/powerpc/mma-single-test.c: Likewise.

Okay for trunk.  It's not going to benefit from any soak-in time other
than what you have tested already, so it is fine for 10 immediately as
well.  Thanks!


Segher
Peter Bergner July 22, 2020, 7:51 p.m. UTC | #2
On 7/22/20 1:00 PM, Segher Boessenkool wrote:
>> gcc/
>> 	PR target/96236
>> 	* config/rs6000/rs6000-call.c (rs6000_gimple_fold_mma_builtin): Handle
>> 	little-endian memory ordering.
>>     
>> gcc/testsuite/
>> 	PR target/96236
>> 	* gcc.target/powerpc/mma-double-test.c: Update storing results for
>> 	correct little-endian ordering.
>> 	* gcc.target/powerpc/mma-single-test.c: Likewise.
> 
> Okay for trunk.  It's not going to benefit from any soak-in time other
> than what you have tested already, so it is fine for 10 immediately as
> well.  Thanks!

Ok, pushed to trunk.  I'll wait for the branch to reopen before pushing it
there too.  Thanks!

Peter
diff mbox series

Patch

diff --git a/gcc/config/rs6000/rs6000-call.c b/gcc/config/rs6000/rs6000-call.c
index 5ec3f2c55ad..bb0fdf29688 100644
--- a/gcc/config/rs6000/rs6000-call.c
+++ b/gcc/config/rs6000/rs6000-call.c
@@ -11154,11 +11154,12 @@  rs6000_gimple_fold_mma_builtin (gimple_stmt_iterator *gsi)
       tree src_array = build1 (VIEW_CONVERT_EXPR, array_type, src);
       for (unsigned i = 0; i < 4; i++)
 	{
+	  unsigned index = WORDS_BIG_ENDIAN ? i : 3 - i;
 	  tree ref = build4 (ARRAY_REF, unsigned_V16QI_type_node, src_array,
 			     build_int_cst (size_type_node, i),
 			     NULL_TREE, NULL_TREE);
 	  tree dst = build2 (MEM_REF, unsigned_V16QI_type_node, dst_base,
-			     build_int_cst (dst_type, i * 16));
+			     build_int_cst (dst_type, index * 16));
 	  gimplify_assign (dst, ref, &new_seq);
 	}
       pop_gimplify_context (NULL);
diff --git a/gcc/testsuite/gcc.target/powerpc/mma-double-test.c b/gcc/testsuite/gcc.target/powerpc/mma-double-test.c
index ac84ae30004..044a288ebcc 100755
--- a/gcc/testsuite/gcc.target/powerpc/mma-double-test.c
+++ b/gcc/testsuite/gcc.target/powerpc/mma-double-test.c
@@ -12,13 +12,13 @@  typedef double v4sf_t __attribute__ ((vector_size (16)));
 #define SAVE_ACC(ACC, ldc, J)  \
 	  __builtin_mma_disassemble_acc (result, ACC); \
 	  rowC = (v4sf_t *) &CO[0*ldc+J]; \
-          rowC[0] += result[3] ; \
+          rowC[0] += result[0]; \
           rowC = (v4sf_t *) &CO[1*ldc+J]; \
-          rowC[0] += result[2] ; \
+          rowC[0] += result[1]; \
           rowC = (v4sf_t *) &CO[2*ldc+J]; \
-          rowC[0] += result[1] ; \
+          rowC[0] += result[2]; \
           rowC = (v4sf_t *) &CO[3*ldc+J]; \
-	  rowC[0] += result[0] ;
+	  rowC[0] += result[3];
 
 void
 MMA (int m, int n, int k, double *A, double *B, double *C)
diff --git a/gcc/testsuite/gcc.target/powerpc/mma-single-test.c b/gcc/testsuite/gcc.target/powerpc/mma-single-test.c
index 15369a64025..7e628df45b7 100755
--- a/gcc/testsuite/gcc.target/powerpc/mma-single-test.c
+++ b/gcc/testsuite/gcc.target/powerpc/mma-single-test.c
@@ -12,24 +12,24 @@  typedef float v4sf_t __attribute__ ((vector_size (16)));
 #define SAVE_ACC(ACC, ldc,J)  \
 	  __builtin_mma_disassemble_acc (result, ACC); \
 	  rowC = (v4sf_t *) &CO[0*ldc+J]; \
-          rowC[0] += result[3] ; \
+          rowC[0] += result[0]; \
           rowC = (v4sf_t *) &CO[1*ldc+J]; \
-          rowC[0] += result[2] ; \
+          rowC[0] += result[1]; \
           rowC = (v4sf_t *) &CO[2*ldc+J]; \
-          rowC[0] += result[1] ; \
+          rowC[0] += result[2]; \
           rowC = (v4sf_t *) &CO[3*ldc+J]; \
-	  rowC[0] += result[0] ;
+	  rowC[0] += result[3];
 
 #define SAVE_ACC1(ACC,ldc, J)  \
 	  __builtin_mma_disassemble_acc (result, ACC); \
 	  rowC = (v4sf_t *) &CO[4* ldc+J]; \
-          rowC[0] += result[3] ; \
+          rowC[0] += result[0]; \
           rowC = (v4sf_t *) &CO[5*ldc+J]; \
-          rowC[0] += result[2] ; \
+          rowC[0] += result[1]; \
           rowC = (v4sf_t *) &CO[6*ldc+J]; \
-          rowC[0] += result[1] ; \
+          rowC[0] += result[2]; \
           rowC = (v4sf_t *) &CO[7*ldc+J]; \
-	  rowC[0] += result[0] ;
+	  rowC[0] += result[3];
 void
 MMA (int m, int n, int k, float *A, float *B, float *C)
 {