rs6000: __builtin_mma_disassemble_acc() doesn't store elements correctly in LE mode
author Peter Bergner <bergner@linux.ibm.com>
Wed, 22 Jul 2020 16:44:35 +0000 (11:44 -0500)
committer Peter Bergner <bergner@linux.ibm.com>
Wed, 22 Jul 2020 18:36:28 +0000 (13:36 -0500)
PR96236 shows a problem where we don't store our 512-bit accumulators
correctly in little-endian mode.  The patch below detects when we're doing a
little-endian memory access and stores to the correct memory locations.

2020-07-22  Peter Bergner  <bergner@linux.ibm.com>

gcc/
PR target/96236
* config/rs6000/rs6000-call.c (rs6000_gimple_fold_mma_builtin): Handle
little-endian memory ordering.

gcc/testsuite/
PR target/96236
* gcc.target/powerpc/mma-double-test.c: Update storing results for
correct little-endian ordering.
* gcc.target/powerpc/mma-single-test.c: Likewise.

gcc/config/rs6000/rs6000-call.c
gcc/testsuite/gcc.target/powerpc/mma-double-test.c
gcc/testsuite/gcc.target/powerpc/mma-single-test.c

index 5ec3f2c55add4e2facfff0bdc9a254a68a127130..bb0fdf2968811adecd2895feadd2fb04a6e99f02 100644 (file)
@@ -11154,11 +11154,12 @@ rs6000_gimple_fold_mma_builtin (gimple_stmt_iterator *gsi)
       tree src_array = build1 (VIEW_CONVERT_EXPR, array_type, src);
       for (unsigned i = 0; i < 4; i++)
        {
+         unsigned index = WORDS_BIG_ENDIAN ? i : 3 - i;
          tree ref = build4 (ARRAY_REF, unsigned_V16QI_type_node, src_array,
                             build_int_cst (size_type_node, i),
                             NULL_TREE, NULL_TREE);
          tree dst = build2 (MEM_REF, unsigned_V16QI_type_node, dst_base,
-                            build_int_cst (dst_type, i * 16));
+                            build_int_cst (dst_type, index * 16));
          gimplify_assign (dst, ref, &new_seq);
        }
       pop_gimplify_context (NULL);
index ac84ae3000485064e2af06d0834c14a0dad36054..044a288ebccf2ec0aa142833c23c1bea9cdbe0d4 100755 (executable)
@@ -12,13 +12,13 @@ typedef double v4sf_t __attribute__ ((vector_size (16)));
 #define SAVE_ACC(ACC, ldc, J)  \
          __builtin_mma_disassemble_acc (result, ACC); \
          rowC = (v4sf_t *) &CO[0*ldc+J]; \
-          rowC[0] += result[3] ; \
+          rowC[0] += result[0]; \
           rowC = (v4sf_t *) &CO[1*ldc+J]; \
-          rowC[0] += result[2] ; \
+          rowC[0] += result[1]; \
           rowC = (v4sf_t *) &CO[2*ldc+J]; \
-          rowC[0] += result[1] ; \
+          rowC[0] += result[2]; \
           rowC = (v4sf_t *) &CO[3*ldc+J]; \
-         rowC[0] += result[0] ;
+         rowC[0] += result[3];
 
 void
 MMA (int m, int n, int k, double *A, double *B, double *C)
index 15369a640258f1fb2ace22985824b20228c91ce3..7e628df45b7b811e929f7ceeefada63224ea9d3e 100755 (executable)
@@ -12,24 +12,24 @@ typedef float v4sf_t __attribute__ ((vector_size (16)));
 #define SAVE_ACC(ACC, ldc,J)  \
          __builtin_mma_disassemble_acc (result, ACC); \
          rowC = (v4sf_t *) &CO[0*ldc+J]; \
-          rowC[0] += result[3] ; \
+          rowC[0] += result[0]; \
           rowC = (v4sf_t *) &CO[1*ldc+J]; \
-          rowC[0] += result[2] ; \
+          rowC[0] += result[1]; \
           rowC = (v4sf_t *) &CO[2*ldc+J]; \
-          rowC[0] += result[1] ; \
+          rowC[0] += result[2]; \
           rowC = (v4sf_t *) &CO[3*ldc+J]; \
-         rowC[0] += result[0] ;
+         rowC[0] += result[3];
 
 #define SAVE_ACC1(ACC,ldc, J)  \
          __builtin_mma_disassemble_acc (result, ACC); \
          rowC = (v4sf_t *) &CO[4* ldc+J]; \
-          rowC[0] += result[3] ; \
+          rowC[0] += result[0]; \
           rowC = (v4sf_t *) &CO[5*ldc+J]; \
-          rowC[0] += result[2] ; \
+          rowC[0] += result[1]; \
           rowC = (v4sf_t *) &CO[6*ldc+J]; \
-          rowC[0] += result[1] ; \
+          rowC[0] += result[2]; \
           rowC = (v4sf_t *) &CO[7*ldc+J]; \
-         rowC[0] += result[0] ;
+         rowC[0] += result[3];
 void
 MMA (int m, int n, int k, float *A, float *B, float *C)
 {