libgfortran/generated/matmul_r4.c

   1 /* Implementation of the MATMUL intrinsic
   2    Copyright (C) 2002-2017 Free Software Foundation, Inc.
   3    Contributed by Paul Brook <paul@nowt.org>
   4
   5 This file is part of the GNU Fortran runtime library (libgfortran).
   6
   7 Libgfortran is free software; you can redistribute it and/or
   8 modify it under the terms of the GNU General Public
   9 License as published by the Free Software Foundation; either
  10 version 3 of the License, or (at your option) any later version.
  11
  12 Libgfortran is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 Under Section 7 of GPL version 3, you are granted additional
  18 permissions described in the GCC Runtime Library Exception, version
  19 3.1, as published by the Free Software Foundation.
  20
  21 You should have received a copy of the GNU General Public License and
  22 a copy of the GCC Runtime Library Exception along with this program;
  23 see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
  24 <http://www.gnu.org/licenses/>.  */
  25
  26 #include "libgfortran.h"
  27 #include <string.h>
  28 #include <assert.h>
  29
  30
  31 #if defined (HAVE_GFC_REAL_4)
  32
  33 /* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
  34    passed to us by the front-end, in which case we call it for large
  35    matrices.  */
  36
  37 typedef void (*blas_call)(const char *, const char *, const int *, const int *,
  38                           const int *, const GFC_REAL_4 *, const GFC_REAL_4 *,
  39                           const int *, const GFC_REAL_4 *, const int *,
  40                           const GFC_REAL_4 *, GFC_REAL_4 *, const int *,
  41                           int, int);
  42
  43 /* The order of loops is different in the case of plain matrix
  44    multiplication C=MATMUL(A,B), and in the frequent special case where
  45    the argument A is the temporary result of a TRANSPOSE intrinsic:
  46    C=MATMUL(TRANSPOSE(A),B).  Transposed temporaries are detected by
  47    looking at their strides.
  48
  49    The equivalent Fortran pseudo-code is:
  50
  51    DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
  52    IF (.NOT.IS_TRANSPOSED(A)) THEN
  53      C = 0
  54      DO J=1,N
  55        DO K=1,COUNT
  56          DO I=1,M
  57            C(I,J) = C(I,J)+A(I,K)*B(K,J)
  58    ELSE
  59      DO J=1,N
  60        DO I=1,M
  61          S = 0
  62          DO K=1,COUNT
  63            S = S+A(I,K)*B(K,J)
  64          C(I,J) = S
  65    ENDIF
  66 */
  67
  68 /* If try_blas is set to a nonzero value, then the matmul function will
  69    see if there is a way to perform the matrix multiplication by a call
  70    to the BLAS gemm function.  */
  71
  72 extern void matmul_r4 (gfc_array_r4 * const restrict retarray,
  73         gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
  74         int blas_limit, blas_call gemm);
  75 export_proto(matmul_r4);
  76
  77 /* Put exhaustive list of possible architectures here here, ORed together.  */
  78
  79 #if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
  80
  81 #ifdef HAVE_AVX
  82 static void
  83 matmul_r4_avx (gfc_array_r4 * const restrict retarray,
  84         gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
  85         int blas_limit, blas_call gemm) __attribute__((__target__("avx")));
  86 static void
  87 matmul_r4_avx (gfc_array_r4 * const restrict retarray,
  88         gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
  89         int blas_limit, blas_call gemm)
  90 {
  91   const GFC_REAL_4 * restrict abase;
  92   const GFC_REAL_4 * restrict bbase;
  93   GFC_REAL_4 * restrict dest;
  94
  95   index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
  96   index_type x, y, n, count, xcount, ycount;
  97
  98   assert (GFC_DESCRIPTOR_RANK (a) == 2
  99           || GFC_DESCRIPTOR_RANK (b) == 2);
 100
 101 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
 102
 103    Either A or B (but not both) can be rank 1:
 104
 105    o One-dimensional argument A is implicitly treated as a row matrix
 106      dimensioned [1,count], so xcount=1.
 107
 108    o One-dimensional argument B is implicitly treated as a column matrix
 109      dimensioned [count, 1], so ycount=1.
 110 */
 111
 112   if (retarray->base_addr == NULL)
 113     {
 114       if (GFC_DESCRIPTOR_RANK (a) == 1)
 115         {
 116           GFC_DIMENSION_SET(retarray->dim[0], 0,
 117                             GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
 118         }
 119       else if (GFC_DESCRIPTOR_RANK (b) == 1)
 120         {
 121           GFC_DIMENSION_SET(retarray->dim[0], 0,
 122                             GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
 123         }
 124       else
 125         {
 126           GFC_DIMENSION_SET(retarray->dim[0], 0,
 127                             GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
 128
 129           GFC_DIMENSION_SET(retarray->dim[1], 0,
 130                             GFC_DESCRIPTOR_EXTENT(b,1) - 1,
 131                             GFC_DESCRIPTOR_EXTENT(retarray,0));
 132         }
 133
 134       retarray->base_addr
 135         = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_4));
 136       retarray->offset = 0;
 137     }
 138   else if (unlikely (compile_options.bounds_check))
 139     {
 140       index_type ret_extent, arg_extent;
 141
 142       if (GFC_DESCRIPTOR_RANK (a) == 1)
 143         {
 144           arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
 145           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
 146           if (arg_extent != ret_extent)
 147             runtime_error ("Incorrect extent in return array in"
 148                            " MATMUL intrinsic: is %ld, should be %ld",
 149                            (long int) ret_extent, (long int) arg_extent);
 150         }
 151       else if (GFC_DESCRIPTOR_RANK (b) == 1)
 152         {
 153           arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
 154           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
 155           if (arg_extent != ret_extent)
 156             runtime_error ("Incorrect extent in return array in"
 157                            " MATMUL intrinsic: is %ld, should be %ld",
 158                            (long int) ret_extent, (long int) arg_extent);
 159         }
 160       else
 161         {
 162           arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
 163           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
 164           if (arg_extent != ret_extent)
 165             runtime_error ("Incorrect extent in return array in"
 166                            " MATMUL intrinsic for dimension 1:"
 167                            " is %ld, should be %ld",
 168                            (long int) ret_extent, (long int) arg_extent);
 169
 170           arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
 171           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
 172           if (arg_extent != ret_extent)
 173             runtime_error ("Incorrect extent in return array in"
 174                            " MATMUL intrinsic for dimension 2:"
 175                            " is %ld, should be %ld",
 176                            (long int) ret_extent, (long int) arg_extent);
 177         }
 178     }
 179
 180
 181   if (GFC_DESCRIPTOR_RANK (retarray) == 1)
 182     {
 183       /* One-dimensional result may be addressed in the code below
 184          either as a row or a column matrix. We want both cases to
 185          work. */
 186       rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
 187     }
 188   else
 189     {
 190       rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
 191       rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
 192     }
 193
 194
 195   if (GFC_DESCRIPTOR_RANK (a) == 1)
 196     {
 197       /* Treat it as a a row matrix A[1,count]. */
 198       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
 199       aystride = 1;
 200
 201       xcount = 1;
 202       count = GFC_DESCRIPTOR_EXTENT(a,0);
 203     }
 204   else
 205     {
 206       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
 207       aystride = GFC_DESCRIPTOR_STRIDE(a,1);
 208
 209       count = GFC_DESCRIPTOR_EXTENT(a,1);
 210       xcount = GFC_DESCRIPTOR_EXTENT(a,0);
 211     }
 212
 213   if (count != GFC_DESCRIPTOR_EXTENT(b,0))
 214     {
 215       if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
 216         runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
 217     }
 218
 219   if (GFC_DESCRIPTOR_RANK (b) == 1)
 220     {
 221       /* Treat it as a column matrix B[count,1] */
 222       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
 223
 224       /* bystride should never be used for 1-dimensional b.
 225          in case it is we want it to cause a segfault, rather than
 226          an incorrect result. */
 227       bystride = 0xDEADBEEF;
 228       ycount = 1;
 229     }
 230   else
 231     {
 232       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
 233       bystride = GFC_DESCRIPTOR_STRIDE(b,1);
 234       ycount = GFC_DESCRIPTOR_EXTENT(b,1);
 235     }
 236
 237   abase = a->base_addr;
 238   bbase = b->base_addr;
 239   dest = retarray->base_addr;
 240
 241   /* Now that everything is set up, we perform the multiplication
 242      itself.  */
 243
 244 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
 245 #define min(a,b) ((a) <= (b) ? (a) : (b))
 246 #define max(a,b) ((a) >= (b) ? (a) : (b))
 247
 248   if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
 249       && (bxstride == 1 || bystride == 1)
 250       && (((float) xcount) * ((float) ycount) * ((float) count)
 251           > POW3(blas_limit)))
 252     {
 253       const int m = xcount, n = ycount, k = count, ldc = rystride;
 254       const GFC_REAL_4 one = 1, zero = 0;
 255       const int lda = (axstride == 1) ? aystride : axstride,
 256                 ldb = (bxstride == 1) ? bystride : bxstride;
 257
 258       if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
 259         {
 260           assert (gemm != NULL);
 261           gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
 262                 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
 263                 &ldc, 1, 1);
 264           return;
 265         }
 266     }
 267
 268   if (rxstride == 1 && axstride == 1 && bxstride == 1)
 269     {
 270       /* This block of code implements a tuned matmul, derived from
 271          Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
 272
 273                Bo Kagstrom and Per Ling
 274                Department of Computing Science
 275                Umea University
 276                S-901 87 Umea, Sweden
 277
 278          from netlib.org, translated to C, and modified for matmul.m4.  */
 279
 280       const GFC_REAL_4 *a, *b;
 281       GFC_REAL_4 *c;
 282       const index_type m = xcount, n = ycount, k = count;
 283
 284       /* System generated locals */
 285       index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
 286                  i1, i2, i3, i4, i5, i6;
 287
 288       /* Local variables */
 289       GFC_REAL_4 f11, f12, f21, f22, f31, f32, f41, f42,
 290                  f13, f14, f23, f24, f33, f34, f43, f44;
 291       index_type i, j, l, ii, jj, ll;
 292       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
 293
 294       a = abase;
 295       b = bbase;
 296       c = retarray->base_addr;
 297
 298       /* Parameter adjustments */
 299       c_dim1 = rystride;
 300       c_offset = 1 + c_dim1;
 301       c -= c_offset;
 302       a_dim1 = aystride;
 303       a_offset = 1 + a_dim1;
 304       a -= a_offset;
 305       b_dim1 = bystride;
 306       b_offset = 1 + b_dim1;
 307       b -= b_offset;
 308
 309       /* Early exit if possible */
 310       if (m == 0 || n == 0 || k == 0)
 311         return;
 312
 313       /* Adjust size of t1 to what is needed.  */
 314       index_type t1_dim;
 315       t1_dim = (a_dim1-1) * 256 + b_dim1;
 316       if (t1_dim > 65536)
 317         t1_dim = 65536;
 318
 319 #pragma GCC diagnostic push
 320 #pragma GCC diagnostic ignored "-Wvla"
 321       GFC_REAL_4 t1[t1_dim]; /* was [256][256] */
 322 #pragma GCC diagnostic pop
 323
 324       /* Empty c first.  */
 325       for (j=1; j<=n; j++)
 326         for (i=1; i<=m; i++)
 327           c[i + j * c_dim1] = (GFC_REAL_4)0;
 328
 329       /* Start turning the crank. */
 330       i1 = n;
 331       for (jj = 1; jj <= i1; jj += 512)
 332         {
 333           /* Computing MIN */
 334           i2 = 512;
 335           i3 = n - jj + 1;
 336           jsec = min(i2,i3);
 337           ujsec = jsec - jsec % 4;
 338           i2 = k;
 339           for (ll = 1; ll <= i2; ll += 256)
 340             {
 341               /* Computing MIN */
 342               i3 = 256;
 343               i4 = k - ll + 1;
 344               lsec = min(i3,i4);
 345               ulsec = lsec - lsec % 2;
 346
 347               i3 = m;
 348               for (ii = 1; ii <= i3; ii += 256)
 349                 {
 350                   /* Computing MIN */
 351                   i4 = 256;
 352                   i5 = m - ii + 1;
 353                   isec = min(i4,i5);
 354                   uisec = isec - isec % 2;
 355                   i4 = ll + ulsec - 1;
 356                   for (l = ll; l <= i4; l += 2)
 357                     {
 358                       i5 = ii + uisec - 1;
 359                       for (i = ii; i <= i5; i += 2)
 360                         {
 361                           t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
 362                                         a[i + l * a_dim1];
 363                           t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
 364                                         a[i + (l + 1) * a_dim1];
 365                           t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
 366                                         a[i + 1 + l * a_dim1];
 367                           t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
 368                                         a[i + 1 + (l + 1) * a_dim1];
 369                         }
 370                       if (uisec < isec)
 371                         {
 372                           t1[l - ll + 1 + (isec << 8) - 257] =
 373                                     a[ii + isec - 1 + l * a_dim1];
 374                           t1[l - ll + 2 + (isec << 8) - 257] =
 375                                     a[ii + isec - 1 + (l + 1) * a_dim1];
 376                         }
 377                     }
 378                   if (ulsec < lsec)
 379                     {
 380                       i4 = ii + isec - 1;
 381                       for (i = ii; i<= i4; ++i)
 382                         {
 383                           t1[lsec + ((i - ii + 1) << 8) - 257] =
 384                                     a[i + (ll + lsec - 1) * a_dim1];
 385                         }
 386                     }
 387
 388                   uisec = isec - isec % 4;
 389                   i4 = jj + ujsec - 1;
 390                   for (j = jj; j <= i4; j += 4)
 391                     {
 392                       i5 = ii + uisec - 1;
 393                       for (i = ii; i <= i5; i += 4)
 394                         {
 395                           f11 = c[i + j * c_dim1];
 396                           f21 = c[i + 1 + j * c_dim1];
 397                           f12 = c[i + (j + 1) * c_dim1];
 398                           f22 = c[i + 1 + (j + 1) * c_dim1];
 399                           f13 = c[i + (j + 2) * c_dim1];
 400                           f23 = c[i + 1 + (j + 2) * c_dim1];
 401                           f14 = c[i + (j + 3) * c_dim1];
 402                           f24 = c[i + 1 + (j + 3) * c_dim1];
 403                           f31 = c[i + 2 + j * c_dim1];
 404                           f41 = c[i + 3 + j * c_dim1];
 405                           f32 = c[i + 2 + (j + 1) * c_dim1];
 406                           f42 = c[i + 3 + (j + 1) * c_dim1];
 407                           f33 = c[i + 2 + (j + 2) * c_dim1];
 408                           f43 = c[i + 3 + (j + 2) * c_dim1];
 409                           f34 = c[i + 2 + (j + 3) * c_dim1];
 410                           f44 = c[i + 3 + (j + 3) * c_dim1];
 411                           i6 = ll + lsec - 1;
 412                           for (l = ll; l <= i6; ++l)
 413                             {
 414                               f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
 415                                       * b[l + j * b_dim1];
 416                               f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
 417                                       * b[l + j * b_dim1];
 418                               f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
 419                                       * b[l + (j + 1) * b_dim1];
 420                               f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
 421                                       * b[l + (j + 1) * b_dim1];
 422                               f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
 423                                       * b[l + (j + 2) * b_dim1];
 424                               f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
 425                                       * b[l + (j + 2) * b_dim1];
 426                               f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
 427                                       * b[l + (j + 3) * b_dim1];
 428                               f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
 429                                       * b[l + (j + 3) * b_dim1];
 430                               f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
 431                                       * b[l + j * b_dim1];
 432                               f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
 433                                       * b[l + j * b_dim1];
 434                               f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
 435                                       * b[l + (j + 1) * b_dim1];
 436                               f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
 437                                       * b[l + (j + 1) * b_dim1];
 438                               f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
 439                                       * b[l + (j + 2) * b_dim1];
 440                               f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
 441                                       * b[l + (j + 2) * b_dim1];
 442                               f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
 443                                       * b[l + (j + 3) * b_dim1];
 444                               f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
 445                                       * b[l + (j + 3) * b_dim1];
 446                             }
 447                           c[i + j * c_dim1] = f11;
 448                           c[i + 1 + j * c_dim1] = f21;
 449                           c[i + (j + 1) * c_dim1] = f12;
 450                           c[i + 1 + (j + 1) * c_dim1] = f22;
 451                           c[i + (j + 2) * c_dim1] = f13;
 452                           c[i + 1 + (j + 2) * c_dim1] = f23;
 453                           c[i + (j + 3) * c_dim1] = f14;
 454                           c[i + 1 + (j + 3) * c_dim1] = f24;
 455                           c[i + 2 + j * c_dim1] = f31;
 456                           c[i + 3 + j * c_dim1] = f41;
 457                           c[i + 2 + (j + 1) * c_dim1] = f32;
 458                           c[i + 3 + (j + 1) * c_dim1] = f42;
 459                           c[i + 2 + (j + 2) * c_dim1] = f33;
 460                           c[i + 3 + (j + 2) * c_dim1] = f43;
 461                           c[i + 2 + (j + 3) * c_dim1] = f34;
 462                           c[i + 3 + (j + 3) * c_dim1] = f44;
 463                         }
 464                       if (uisec < isec)
 465                         {
 466                           i5 = ii + isec - 1;
 467                           for (i = ii + uisec; i <= i5; ++i)
 468                             {
 469                               f11 = c[i + j * c_dim1];
 470                               f12 = c[i + (j + 1) * c_dim1];
 471                               f13 = c[i + (j + 2) * c_dim1];
 472                               f14 = c[i + (j + 3) * c_dim1];
 473                               i6 = ll + lsec - 1;
 474                               for (l = ll; l <= i6; ++l)
 475                                 {
 476                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
 477                                           257] * b[l + j * b_dim1];
 478                                   f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
 479                                           257] * b[l + (j + 1) * b_dim1];
 480                                   f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
 481                                           257] * b[l + (j + 2) * b_dim1];
 482                                   f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
 483                                           257] * b[l + (j + 3) * b_dim1];
 484                                 }
 485                               c[i + j * c_dim1] = f11;
 486                               c[i + (j + 1) * c_dim1] = f12;
 487                               c[i + (j + 2) * c_dim1] = f13;
 488                               c[i + (j + 3) * c_dim1] = f14;
 489                             }
 490                         }
 491                     }
 492                   if (ujsec < jsec)
 493                     {
 494                       i4 = jj + jsec - 1;
 495                       for (j = jj + ujsec; j <= i4; ++j)
 496                         {
 497                           i5 = ii + uisec - 1;
 498                           for (i = ii; i <= i5; i += 4)
 499                             {
 500                               f11 = c[i + j * c_dim1];
 501                               f21 = c[i + 1 + j * c_dim1];
 502                               f31 = c[i + 2 + j * c_dim1];
 503                               f41 = c[i + 3 + j * c_dim1];
 504                               i6 = ll + lsec - 1;
 505                               for (l = ll; l <= i6; ++l)
 506                                 {
 507                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
 508                                           257] * b[l + j * b_dim1];
 509                                   f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
 510                                           257] * b[l + j * b_dim1];
 511                                   f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
 512                                           257] * b[l + j * b_dim1];
 513                                   f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
 514                                           257] * b[l + j * b_dim1];
 515                                 }
 516                               c[i + j * c_dim1] = f11;
 517                               c[i + 1 + j * c_dim1] = f21;
 518                               c[i + 2 + j * c_dim1] = f31;
 519                               c[i + 3 + j * c_dim1] = f41;
 520                             }
 521                           i5 = ii + isec - 1;
 522                           for (i = ii + uisec; i <= i5; ++i)
 523                             {
 524                               f11 = c[i + j * c_dim1];
 525                               i6 = ll + lsec - 1;
 526                               for (l = ll; l <= i6; ++l)
 527                                 {
 528                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
 529                                           257] * b[l + j * b_dim1];
 530                                 }
 531                               c[i + j * c_dim1] = f11;
 532                             }
 533                         }
 534                     }
 535                 }
 536             }
 537         }
 538       return;
 539     }
 540   else if (rxstride == 1 && aystride == 1 && bxstride == 1)
 541     {
 542       if (GFC_DESCRIPTOR_RANK (a) != 1)
 543         {
 544           const GFC_REAL_4 *restrict abase_x;
 545           const GFC_REAL_4 *restrict bbase_y;
 546           GFC_REAL_4 *restrict dest_y;
 547           GFC_REAL_4 s;
 548
 549           for (y = 0; y < ycount; y++)
 550             {
 551               bbase_y = &bbase[y*bystride];
 552               dest_y = &dest[y*rystride];
 553               for (x = 0; x < xcount; x++)
 554                 {
 555                   abase_x = &abase[x*axstride];
 556                   s = (GFC_REAL_4) 0;
 557                   for (n = 0; n < count; n++)
 558                     s += abase_x[n] * bbase_y[n];
 559                   dest_y[x] = s;
 560                 }
 561             }
 562         }
 563       else
 564         {
 565           const GFC_REAL_4 *restrict bbase_y;
 566           GFC_REAL_4 s;
 567
 568           for (y = 0; y < ycount; y++)
 569             {
 570               bbase_y = &bbase[y*bystride];
 571               s = (GFC_REAL_4) 0;
 572               for (n = 0; n < count; n++)
 573                 s += abase[n*axstride] * bbase_y[n];
 574               dest[y*rystride] = s;
 575             }
 576         }
 577     }
 578   else if (axstride < aystride)
 579     {
 580       for (y = 0; y < ycount; y++)
 581         for (x = 0; x < xcount; x++)
 582           dest[x*rxstride + y*rystride] = (GFC_REAL_4)0;
 583
 584       for (y = 0; y < ycount; y++)
 585         for (n = 0; n < count; n++)
 586           for (x = 0; x < xcount; x++)
 587             /* dest[x,y] += a[x,n] * b[n,y] */
 588             dest[x*rxstride + y*rystride] +=
 589                                         abase[x*axstride + n*aystride] *
 590                                         bbase[n*bxstride + y*bystride];
 591     }
 592   else if (GFC_DESCRIPTOR_RANK (a) == 1)
 593     {
 594       const GFC_REAL_4 *restrict bbase_y;
 595       GFC_REAL_4 s;
 596
 597       for (y = 0; y < ycount; y++)
 598         {
 599           bbase_y = &bbase[y*bystride];
 600           s = (GFC_REAL_4) 0;
 601           for (n = 0; n < count; n++)
 602             s += abase[n*axstride] * bbase_y[n*bxstride];
 603           dest[y*rxstride] = s;
 604         }
 605     }
 606   else
 607     {
 608       const GFC_REAL_4 *restrict abase_x;
 609       const GFC_REAL_4 *restrict bbase_y;
 610       GFC_REAL_4 *restrict dest_y;
 611       GFC_REAL_4 s;
 612
 613       for (y = 0; y < ycount; y++)
 614         {
 615           bbase_y = &bbase[y*bystride];
 616           dest_y = &dest[y*rystride];
 617           for (x = 0; x < xcount; x++)
 618             {
 619               abase_x = &abase[x*axstride];
 620               s = (GFC_REAL_4) 0;
 621               for (n = 0; n < count; n++)
 622                 s += abase_x[n*aystride] * bbase_y[n*bxstride];
 623               dest_y[x*rxstride] = s;
 624             }
 625         }
 626     }
 627 }
 628 #undef POW3
 629 #undef min
 630 #undef max
 631
 632 #endif /* HAVE_AVX */
 633
 634 #ifdef HAVE_AVX2
 635 static void
 636 matmul_r4_avx2 (gfc_array_r4 * const restrict retarray,
 637         gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
 638         int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
 639 static void
 640 matmul_r4_avx2 (gfc_array_r4 * const restrict retarray,
 641         gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
 642         int blas_limit, blas_call gemm)
 643 {
 644   const GFC_REAL_4 * restrict abase;
 645   const GFC_REAL_4 * restrict bbase;
 646   GFC_REAL_4 * restrict dest;
 647
 648   index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
 649   index_type x, y, n, count, xcount, ycount;
 650
 651   assert (GFC_DESCRIPTOR_RANK (a) == 2
 652           || GFC_DESCRIPTOR_RANK (b) == 2);
 653
 654 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
 655
 656    Either A or B (but not both) can be rank 1:
 657
 658    o One-dimensional argument A is implicitly treated as a row matrix
 659      dimensioned [1,count], so xcount=1.
 660
 661    o One-dimensional argument B is implicitly treated as a column matrix
 662      dimensioned [count, 1], so ycount=1.
 663 */
 664
 665   if (retarray->base_addr == NULL)
 666     {
 667       if (GFC_DESCRIPTOR_RANK (a) == 1)
 668         {
 669           GFC_DIMENSION_SET(retarray->dim[0], 0,
 670                             GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
 671         }
 672       else if (GFC_DESCRIPTOR_RANK (b) == 1)
 673         {
 674           GFC_DIMENSION_SET(retarray->dim[0], 0,
 675                             GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
 676         }
 677       else
 678         {
 679           GFC_DIMENSION_SET(retarray->dim[0], 0,
 680                             GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
 681
 682           GFC_DIMENSION_SET(retarray->dim[1], 0,
 683                             GFC_DESCRIPTOR_EXTENT(b,1) - 1,
 684                             GFC_DESCRIPTOR_EXTENT(retarray,0));
 685         }
 686
 687       retarray->base_addr
 688         = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_4));
 689       retarray->offset = 0;
 690     }
 691   else if (unlikely (compile_options.bounds_check))
 692     {
 693       index_type ret_extent, arg_extent;
 694
 695       if (GFC_DESCRIPTOR_RANK (a) == 1)
 696         {
 697           arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
 698           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
 699           if (arg_extent != ret_extent)
 700             runtime_error ("Incorrect extent in return array in"
 701                            " MATMUL intrinsic: is %ld, should be %ld",
 702                            (long int) ret_extent, (long int) arg_extent);
 703         }
 704       else if (GFC_DESCRIPTOR_RANK (b) == 1)
 705         {
 706           arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
 707           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
 708           if (arg_extent != ret_extent)
 709             runtime_error ("Incorrect extent in return array in"
 710                            " MATMUL intrinsic: is %ld, should be %ld",
 711                            (long int) ret_extent, (long int) arg_extent);
 712         }
 713       else
 714         {
 715           arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
 716           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
 717           if (arg_extent != ret_extent)
 718             runtime_error ("Incorrect extent in return array in"
 719                            " MATMUL intrinsic for dimension 1:"
 720                            " is %ld, should be %ld",
 721                            (long int) ret_extent, (long int) arg_extent);
 722
 723           arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
 724           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
 725           if (arg_extent != ret_extent)
 726             runtime_error ("Incorrect extent in return array in"
 727                            " MATMUL intrinsic for dimension 2:"
 728                            " is %ld, should be %ld",
 729                            (long int) ret_extent, (long int) arg_extent);
 730         }
 731     }
 732
 733
 734   if (GFC_DESCRIPTOR_RANK (retarray) == 1)
 735     {
 736       /* One-dimensional result may be addressed in the code below
 737          either as a row or a column matrix. We want both cases to
 738          work. */
 739       rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
 740     }
 741   else
 742     {
 743       rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
 744       rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
 745     }
 746
 747
 748   if (GFC_DESCRIPTOR_RANK (a) == 1)
 749     {
 750       /* Treat it as a a row matrix A[1,count]. */
 751       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
 752       aystride = 1;
 753
 754       xcount = 1;
 755       count = GFC_DESCRIPTOR_EXTENT(a,0);
 756     }
 757   else
 758     {
 759       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
 760       aystride = GFC_DESCRIPTOR_STRIDE(a,1);
 761
 762       count = GFC_DESCRIPTOR_EXTENT(a,1);
 763       xcount = GFC_DESCRIPTOR_EXTENT(a,0);
 764     }
 765
 766   if (count != GFC_DESCRIPTOR_EXTENT(b,0))
 767     {
 768       if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
 769         runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
 770     }
 771
 772   if (GFC_DESCRIPTOR_RANK (b) == 1)
 773     {
 774       /* Treat it as a column matrix B[count,1] */
 775       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
 776
 777       /* bystride should never be used for 1-dimensional b.
 778          in case it is we want it to cause a segfault, rather than
 779          an incorrect result. */
 780       bystride = 0xDEADBEEF;
 781       ycount = 1;
 782     }
 783   else
 784     {
 785       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
 786       bystride = GFC_DESCRIPTOR_STRIDE(b,1);
 787       ycount = GFC_DESCRIPTOR_EXTENT(b,1);
 788     }
 789
 790   abase = a->base_addr;
 791   bbase = b->base_addr;
 792   dest = retarray->base_addr;
 793
 794   /* Now that everything is set up, we perform the multiplication
 795      itself.  */
 796
 797 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
 798 #define min(a,b) ((a) <= (b) ? (a) : (b))
 799 #define max(a,b) ((a) >= (b) ? (a) : (b))
 800
 801   if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
 802       && (bxstride == 1 || bystride == 1)
 803       && (((float) xcount) * ((float) ycount) * ((float) count)
 804           > POW3(blas_limit)))
 805     {
 806       const int m = xcount, n = ycount, k = count, ldc = rystride;
 807       const GFC_REAL_4 one = 1, zero = 0;
 808       const int lda = (axstride == 1) ? aystride : axstride,
 809                 ldb = (bxstride == 1) ? bystride : bxstride;
 810
 811       if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
 812         {
 813           assert (gemm != NULL);
 814           gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
 815                 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
 816                 &ldc, 1, 1);
 817           return;
 818         }
 819     }
 820
 821   if (rxstride == 1 && axstride == 1 && bxstride == 1)
 822     {
 823       /* This block of code implements a tuned matmul, derived from
 824          Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
 825
 826                Bo Kagstrom and Per Ling
 827                Department of Computing Science
 828                Umea University
 829                S-901 87 Umea, Sweden
 830
 831          from netlib.org, translated to C, and modified for matmul.m4.  */
 832
 833       const GFC_REAL_4 *a, *b;
 834       GFC_REAL_4 *c;
 835       const index_type m = xcount, n = ycount, k = count;
 836
 837       /* System generated locals */
 838       index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
 839                  i1, i2, i3, i4, i5, i6;
 840
 841       /* Local variables */
 842       GFC_REAL_4 f11, f12, f21, f22, f31, f32, f41, f42,
 843                  f13, f14, f23, f24, f33, f34, f43, f44;
 844       index_type i, j, l, ii, jj, ll;
 845       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
 846
 847       a = abase;
 848       b = bbase;
 849       c = retarray->base_addr;
 850
 851       /* Parameter adjustments */
 852       c_dim1 = rystride;
 853       c_offset = 1 + c_dim1;
 854       c -= c_offset;
 855       a_dim1 = aystride;
 856       a_offset = 1 + a_dim1;
 857       a -= a_offset;
 858       b_dim1 = bystride;
 859       b_offset = 1 + b_dim1;
 860       b -= b_offset;
 861
 862       /* Early exit if possible */
 863       if (m == 0 || n == 0 || k == 0)
 864         return;
 865
 866       /* Adjust size of t1 to what is needed.  */
 867       index_type t1_dim;
 868       t1_dim = (a_dim1-1) * 256 + b_dim1;
 869       if (t1_dim > 65536)
 870         t1_dim = 65536;
 871
 872 #pragma GCC diagnostic push
 873 #pragma GCC diagnostic ignored "-Wvla"
 874       GFC_REAL_4 t1[t1_dim]; /* was [256][256] */
 875 #pragma GCC diagnostic pop
 876
 877       /* Empty c first.  */
 878       for (j=1; j<=n; j++)
 879         for (i=1; i<=m; i++)
 880           c[i + j * c_dim1] = (GFC_REAL_4)0;
 881
 882       /* Start turning the crank. */
 883       i1 = n;
 884       for (jj = 1; jj <= i1; jj += 512)
 885         {
 886           /* Computing MIN */
 887           i2 = 512;
 888           i3 = n - jj + 1;
 889           jsec = min(i2,i3);
 890           ujsec = jsec - jsec % 4;
 891           i2 = k;
 892           for (ll = 1; ll <= i2; ll += 256)
 893             {
 894               /* Computing MIN */
 895               i3 = 256;
 896               i4 = k - ll + 1;
 897               lsec = min(i3,i4);
 898               ulsec = lsec - lsec % 2;
 899
 900               i3 = m;
 901               for (ii = 1; ii <= i3; ii += 256)
 902                 {
 903                   /* Computing MIN */
 904                   i4 = 256;
 905                   i5 = m - ii + 1;
 906                   isec = min(i4,i5);
 907                   uisec = isec - isec % 2;
 908                   i4 = ll + ulsec - 1;
 909                   for (l = ll; l <= i4; l += 2)
 910                     {
 911                       i5 = ii + uisec - 1;
 912                       for (i = ii; i <= i5; i += 2)
 913                         {
 914                           t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
 915                                         a[i + l * a_dim1];
 916                           t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
 917                                         a[i + (l + 1) * a_dim1];
 918                           t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
 919                                         a[i + 1 + l * a_dim1];
 920                           t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
 921                                         a[i + 1 + (l + 1) * a_dim1];
 922                         }
 923                       if (uisec < isec)
 924                         {
 925                           t1[l - ll + 1 + (isec << 8) - 257] =
 926                                     a[ii + isec - 1 + l * a_dim1];
 927                           t1[l - ll + 2 + (isec << 8) - 257] =
 928                                     a[ii + isec - 1 + (l + 1) * a_dim1];
 929                         }
 930                     }
 931                   if (ulsec < lsec)
 932                     {
 933                       i4 = ii + isec - 1;
 934                       for (i = ii; i<= i4; ++i)
 935                         {
 936                           t1[lsec + ((i - ii + 1) << 8) - 257] =
 937                                     a[i + (ll + lsec - 1) * a_dim1];
 938                         }
 939                     }
 940
 941                   uisec = isec - isec % 4;
 942                   i4 = jj + ujsec - 1;
 943                   for (j = jj; j <= i4; j += 4)
 944                     {
 945                       i5 = ii + uisec - 1;
 946                       for (i = ii; i <= i5; i += 4)
 947                         {
 948                           f11 = c[i + j * c_dim1];
 949                           f21 = c[i + 1 + j * c_dim1];
 950                           f12 = c[i + (j + 1) * c_dim1];
 951                           f22 = c[i + 1 + (j + 1) * c_dim1];
 952                           f13 = c[i + (j + 2) * c_dim1];
 953                           f23 = c[i + 1 + (j + 2) * c_dim1];
 954                           f14 = c[i + (j + 3) * c_dim1];
 955                           f24 = c[i + 1 + (j + 3) * c_dim1];
 956                           f31 = c[i + 2 + j * c_dim1];
 957                           f41 = c[i + 3 + j * c_dim1];
 958                           f32 = c[i + 2 + (j + 1) * c_dim1];
 959                           f42 = c[i + 3 + (j + 1) * c_dim1];
 960                           f33 = c[i + 2 + (j + 2) * c_dim1];
 961                           f43 = c[i + 3 + (j + 2) * c_dim1];
 962                           f34 = c[i + 2 + (j + 3) * c_dim1];
 963                           f44 = c[i + 3 + (j + 3) * c_dim1];
 964                           i6 = ll + lsec - 1;
 965                           for (l = ll; l <= i6; ++l)
 966                             {
 967                               f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
 968                                       * b[l + j * b_dim1];
 969                               f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
 970                                       * b[l + j * b_dim1];
 971                               f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
 972                                       * b[l + (j + 1) * b_dim1];
 973                               f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
 974                                       * b[l + (j + 1) * b_dim1];
 975                               f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
 976                                       * b[l + (j + 2) * b_dim1];
 977                               f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
 978                                       * b[l + (j + 2) * b_dim1];
 979                               f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
 980                                       * b[l + (j + 3) * b_dim1];
 981                               f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
 982                                       * b[l + (j + 3) * b_dim1];
 983                               f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
 984                                       * b[l + j * b_dim1];
 985                               f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
 986                                       * b[l + j * b_dim1];
 987                               f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
 988                                       * b[l + (j + 1) * b_dim1];
 989                               f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
 990                                       * b[l + (j + 1) * b_dim1];
 991                               f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
 992                                       * b[l + (j + 2) * b_dim1];
 993                               f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
 994                                       * b[l + (j + 2) * b_dim1];
 995                               f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
 996                                       * b[l + (j + 3) * b_dim1];
 997                               f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
 998                                       * b[l + (j + 3) * b_dim1];
 999                             }
1000                           c[i + j * c_dim1] = f11;
1001                           c[i + 1 + j * c_dim1] = f21;
1002                           c[i + (j + 1) * c_dim1] = f12;
1003                           c[i + 1 + (j + 1) * c_dim1] = f22;
1004                           c[i + (j + 2) * c_dim1] = f13;
1005                           c[i + 1 + (j + 2) * c_dim1] = f23;
1006                           c[i + (j + 3) * c_dim1] = f14;
1007                           c[i + 1 + (j + 3) * c_dim1] = f24;
1008                           c[i + 2 + j * c_dim1] = f31;
1009                           c[i + 3 + j * c_dim1] = f41;
1010                           c[i + 2 + (j + 1) * c_dim1] = f32;
1011                           c[i + 3 + (j + 1) * c_dim1] = f42;
1012                           c[i + 2 + (j + 2) * c_dim1] = f33;
1013                           c[i + 3 + (j + 2) * c_dim1] = f43;
1014                           c[i + 2 + (j + 3) * c_dim1] = f34;
1015                           c[i + 3 + (j + 3) * c_dim1] = f44;
1016                         }
1017                       if (uisec < isec)
1018                         {
1019                           i5 = ii + isec - 1;
1020                           for (i = ii + uisec; i <= i5; ++i)
1021                             {
1022                               f11 = c[i + j * c_dim1];
1023                               f12 = c[i + (j + 1) * c_dim1];
1024                               f13 = c[i + (j + 2) * c_dim1];
1025                               f14 = c[i + (j + 3) * c_dim1];
1026                               i6 = ll + lsec - 1;
1027                               for (l = ll; l <= i6; ++l)
1028                                 {
1029                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1030                                           257] * b[l + j * b_dim1];
1031                                   f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1032                                           257] * b[l + (j + 1) * b_dim1];
1033                                   f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1034                                           257] * b[l + (j + 2) * b_dim1];
1035                                   f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1036                                           257] * b[l + (j + 3) * b_dim1];
1037                                 }
1038                               c[i + j * c_dim1] = f11;
1039                               c[i + (j + 1) * c_dim1] = f12;
1040                               c[i + (j + 2) * c_dim1] = f13;
1041                               c[i + (j + 3) * c_dim1] = f14;
1042                             }
1043                         }
1044                     }
1045                   if (ujsec < jsec)
1046                     {
1047                       i4 = jj + jsec - 1;
1048                       for (j = jj + ujsec; j <= i4; ++j)
1049                         {
1050                           i5 = ii + uisec - 1;
1051                           for (i = ii; i <= i5; i += 4)
1052                             {
1053                               f11 = c[i + j * c_dim1];
1054                               f21 = c[i + 1 + j * c_dim1];
1055                               f31 = c[i + 2 + j * c_dim1];
1056                               f41 = c[i + 3 + j * c_dim1];
1057                               i6 = ll + lsec - 1;
1058                               for (l = ll; l <= i6; ++l)
1059                                 {
1060                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1061                                           257] * b[l + j * b_dim1];
1062                                   f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
1063                                           257] * b[l + j * b_dim1];
1064                                   f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
1065                                           257] * b[l + j * b_dim1];
1066                                   f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
1067                                           257] * b[l + j * b_dim1];
1068                                 }
1069                               c[i + j * c_dim1] = f11;
1070                               c[i + 1 + j * c_dim1] = f21;
1071                               c[i + 2 + j * c_dim1] = f31;
1072                               c[i + 3 + j * c_dim1] = f41;
1073                             }
1074                           i5 = ii + isec - 1;
1075                           for (i = ii + uisec; i <= i5; ++i)
1076                             {
1077                               f11 = c[i + j * c_dim1];
1078                               i6 = ll + lsec - 1;
1079                               for (l = ll; l <= i6; ++l)
1080                                 {
1081                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1082                                           257] * b[l + j * b_dim1];
1083                                 }
1084                               c[i + j * c_dim1] = f11;
1085                             }
1086                         }
1087                     }
1088                 }
1089             }
1090         }
1091       return;
1092     }
1093   else if (rxstride == 1 && aystride == 1 && bxstride == 1)
1094     {
1095       if (GFC_DESCRIPTOR_RANK (a) != 1)
1096         {
1097           const GFC_REAL_4 *restrict abase_x;
1098           const GFC_REAL_4 *restrict bbase_y;
1099           GFC_REAL_4 *restrict dest_y;
1100           GFC_REAL_4 s;
1101
1102           for (y = 0; y < ycount; y++)
1103             {
1104               bbase_y = &bbase[y*bystride];
1105               dest_y = &dest[y*rystride];
1106               for (x = 0; x < xcount; x++)
1107                 {
1108                   abase_x = &abase[x*axstride];
1109                   s = (GFC_REAL_4) 0;
1110                   for (n = 0; n < count; n++)
1111                     s += abase_x[n] * bbase_y[n];
1112                   dest_y[x] = s;
1113                 }
1114             }
1115         }
1116       else
1117         {
1118           const GFC_REAL_4 *restrict bbase_y;
1119           GFC_REAL_4 s;
1120
1121           for (y = 0; y < ycount; y++)
1122             {
1123               bbase_y = &bbase[y*bystride];
1124               s = (GFC_REAL_4) 0;
1125               for (n = 0; n < count; n++)
1126                 s += abase[n*axstride] * bbase_y[n];
1127               dest[y*rystride] = s;
1128             }
1129         }
1130     }
1131   else if (axstride < aystride)
1132     {
1133       for (y = 0; y < ycount; y++)
1134         for (x = 0; x < xcount; x++)
1135           dest[x*rxstride + y*rystride] = (GFC_REAL_4)0;
1136
1137       for (y = 0; y < ycount; y++)
1138         for (n = 0; n < count; n++)
1139           for (x = 0; x < xcount; x++)
1140             /* dest[x,y] += a[x,n] * b[n,y] */
1141             dest[x*rxstride + y*rystride] +=
1142                                         abase[x*axstride + n*aystride] *
1143                                         bbase[n*bxstride + y*bystride];
1144     }
1145   else if (GFC_DESCRIPTOR_RANK (a) == 1)
1146     {
1147       const GFC_REAL_4 *restrict bbase_y;
1148       GFC_REAL_4 s;
1149
1150       for (y = 0; y < ycount; y++)
1151         {
1152           bbase_y = &bbase[y*bystride];
1153           s = (GFC_REAL_4) 0;
1154           for (n = 0; n < count; n++)
1155             s += abase[n*axstride] * bbase_y[n*bxstride];
1156           dest[y*rxstride] = s;
1157         }
1158     }
1159   else
1160     {
1161       const GFC_REAL_4 *restrict abase_x;
1162       const GFC_REAL_4 *restrict bbase_y;
1163       GFC_REAL_4 *restrict dest_y;
1164       GFC_REAL_4 s;
1165
1166       for (y = 0; y < ycount; y++)
1167         {
1168           bbase_y = &bbase[y*bystride];
1169           dest_y = &dest[y*rystride];
1170           for (x = 0; x < xcount; x++)
1171             {
1172               abase_x = &abase[x*axstride];
1173               s = (GFC_REAL_4) 0;
1174               for (n = 0; n < count; n++)
1175                 s += abase_x[n*aystride] * bbase_y[n*bxstride];
1176               dest_y[x*rxstride] = s;
1177             }
1178         }
1179     }
1180 }
1181 #undef POW3
1182 #undef min
1183 #undef max
1184
1185 #endif /* HAVE_AVX2 */
1186
1187 #ifdef HAVE_AVX512F
1188 static void
1189 matmul_r4_avx512f (gfc_array_r4 * const restrict retarray,
1190         gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
1191         int blas_limit, blas_call gemm) __attribute__((__target__("avx512f")));
1192 static void
1193 matmul_r4_avx512f (gfc_array_r4 * const restrict retarray,
1194         gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
1195         int blas_limit, blas_call gemm)
1196 {
1197   const GFC_REAL_4 * restrict abase;
1198   const GFC_REAL_4 * restrict bbase;
1199   GFC_REAL_4 * restrict dest;
1200
1201   index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
1202   index_type x, y, n, count, xcount, ycount;
1203
1204   assert (GFC_DESCRIPTOR_RANK (a) == 2
1205           || GFC_DESCRIPTOR_RANK (b) == 2);
1206
1207 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
1208
1209    Either A or B (but not both) can be rank 1:
1210
1211    o One-dimensional argument A is implicitly treated as a row matrix
1212      dimensioned [1,count], so xcount=1.
1213
1214    o One-dimensional argument B is implicitly treated as a column matrix
1215      dimensioned [count, 1], so ycount=1.
1216 */
1217
1218   if (retarray->base_addr == NULL)
1219     {
1220       if (GFC_DESCRIPTOR_RANK (a) == 1)
1221         {
1222           GFC_DIMENSION_SET(retarray->dim[0], 0,
1223                             GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
1224         }
1225       else if (GFC_DESCRIPTOR_RANK (b) == 1)
1226         {
1227           GFC_DIMENSION_SET(retarray->dim[0], 0,
1228                             GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1229         }
1230       else
1231         {
1232           GFC_DIMENSION_SET(retarray->dim[0], 0,
1233                             GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1234
1235           GFC_DIMENSION_SET(retarray->dim[1], 0,
1236                             GFC_DESCRIPTOR_EXTENT(b,1) - 1,
1237                             GFC_DESCRIPTOR_EXTENT(retarray,0));
1238         }
1239
1240       retarray->base_addr
1241         = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_4));
1242       retarray->offset = 0;
1243     }
1244   else if (unlikely (compile_options.bounds_check))
1245     {
1246       index_type ret_extent, arg_extent;
1247
1248       if (GFC_DESCRIPTOR_RANK (a) == 1)
1249         {
1250           arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1251           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1252           if (arg_extent != ret_extent)
1253             runtime_error ("Incorrect extent in return array in"
1254                            " MATMUL intrinsic: is %ld, should be %ld",
1255                            (long int) ret_extent, (long int) arg_extent);
1256         }
1257       else if (GFC_DESCRIPTOR_RANK (b) == 1)
1258         {
1259           arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1260           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1261           if (arg_extent != ret_extent)
1262             runtime_error ("Incorrect extent in return array in"
1263                            " MATMUL intrinsic: is %ld, should be %ld",
1264                            (long int) ret_extent, (long int) arg_extent);
1265         }
1266       else
1267         {
1268           arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1269           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1270           if (arg_extent != ret_extent)
1271             runtime_error ("Incorrect extent in return array in"
1272                            " MATMUL intrinsic for dimension 1:"
1273                            " is %ld, should be %ld",
1274                            (long int) ret_extent, (long int) arg_extent);
1275
1276           arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1277           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
1278           if (arg_extent != ret_extent)
1279             runtime_error ("Incorrect extent in return array in"
1280                            " MATMUL intrinsic for dimension 2:"
1281                            " is %ld, should be %ld",
1282                            (long int) ret_extent, (long int) arg_extent);
1283         }
1284     }
1285
1286
1287   if (GFC_DESCRIPTOR_RANK (retarray) == 1)
1288     {
1289       /* One-dimensional result may be addressed in the code below
1290          either as a row or a column matrix. We want both cases to
1291          work. */
1292       rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1293     }
1294   else
1295     {
1296       rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1297       rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
1298     }
1299
1300
1301   if (GFC_DESCRIPTOR_RANK (a) == 1)
1302     {
1303       /* Treat it as a a row matrix A[1,count]. */
1304       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1305       aystride = 1;
1306
1307       xcount = 1;
1308       count = GFC_DESCRIPTOR_EXTENT(a,0);
1309     }
1310   else
1311     {
1312       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1313       aystride = GFC_DESCRIPTOR_STRIDE(a,1);
1314
1315       count = GFC_DESCRIPTOR_EXTENT(a,1);
1316       xcount = GFC_DESCRIPTOR_EXTENT(a,0);
1317     }
1318
1319   if (count != GFC_DESCRIPTOR_EXTENT(b,0))
1320     {
1321       if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
1322         runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
1323     }
1324
1325   if (GFC_DESCRIPTOR_RANK (b) == 1)
1326     {
1327       /* Treat it as a column matrix B[count,1] */
1328       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1329
1330       /* bystride should never be used for 1-dimensional b.
1331          in case it is we want it to cause a segfault, rather than
1332          an incorrect result. */
1333       bystride = 0xDEADBEEF;
1334       ycount = 1;
1335     }
1336   else
1337     {
1338       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1339       bystride = GFC_DESCRIPTOR_STRIDE(b,1);
1340       ycount = GFC_DESCRIPTOR_EXTENT(b,1);
1341     }
1342
1343   abase = a->base_addr;
1344   bbase = b->base_addr;
1345   dest = retarray->base_addr;
1346
1347   /* Now that everything is set up, we perform the multiplication
1348      itself.  */
1349
1350 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
1351 #define min(a,b) ((a) <= (b) ? (a) : (b))
1352 #define max(a,b) ((a) >= (b) ? (a) : (b))
1353
1354   if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
1355       && (bxstride == 1 || bystride == 1)
1356       && (((float) xcount) * ((float) ycount) * ((float) count)
1357           > POW3(blas_limit)))
1358     {
1359       const int m = xcount, n = ycount, k = count, ldc = rystride;
1360       const GFC_REAL_4 one = 1, zero = 0;
1361       const int lda = (axstride == 1) ? aystride : axstride,
1362                 ldb = (bxstride == 1) ? bystride : bxstride;
1363
1364       if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
1365         {
1366           assert (gemm != NULL);
1367           gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
1368                 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
1369                 &ldc, 1, 1);
1370           return;
1371         }
1372     }
1373
1374   if (rxstride == 1 && axstride == 1 && bxstride == 1)
1375     {
1376       /* This block of code implements a tuned matmul, derived from
1377          Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
1378
1379                Bo Kagstrom and Per Ling
1380                Department of Computing Science
1381                Umea University
1382                S-901 87 Umea, Sweden
1383
1384          from netlib.org, translated to C, and modified for matmul.m4.  */
1385
1386       const GFC_REAL_4 *a, *b;
1387       GFC_REAL_4 *c;
1388       const index_type m = xcount, n = ycount, k = count;
1389
1390       /* System generated locals */
1391       index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
1392                  i1, i2, i3, i4, i5, i6;
1393
1394       /* Local variables */
1395       GFC_REAL_4 f11, f12, f21, f22, f31, f32, f41, f42,
1396                  f13, f14, f23, f24, f33, f34, f43, f44;
1397       index_type i, j, l, ii, jj, ll;
1398       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
1399
1400       a = abase;
1401       b = bbase;
1402       c = retarray->base_addr;
1403
1404       /* Parameter adjustments */
1405       c_dim1 = rystride;
1406       c_offset = 1 + c_dim1;
1407       c -= c_offset;
1408       a_dim1 = aystride;
1409       a_offset = 1 + a_dim1;
1410       a -= a_offset;
1411       b_dim1 = bystride;
1412       b_offset = 1 + b_dim1;
1413       b -= b_offset;
1414
1415       /* Early exit if possible */
1416       if (m == 0 || n == 0 || k == 0)
1417         return;
1418
1419       /* Adjust size of t1 to what is needed.  */
1420       index_type t1_dim;
1421       t1_dim = (a_dim1-1) * 256 + b_dim1;
1422       if (t1_dim > 65536)
1423         t1_dim = 65536;
1424
1425 #pragma GCC diagnostic push
1426 #pragma GCC diagnostic ignored "-Wvla"
1427       GFC_REAL_4 t1[t1_dim]; /* was [256][256] */
1428 #pragma GCC diagnostic pop
1429
1430       /* Empty c first.  */
1431       for (j=1; j<=n; j++)
1432         for (i=1; i<=m; i++)
1433           c[i + j * c_dim1] = (GFC_REAL_4)0;
1434
1435       /* Start turning the crank. */
1436       i1 = n;
1437       for (jj = 1; jj <= i1; jj += 512)
1438         {
1439           /* Computing MIN */
1440           i2 = 512;
1441           i3 = n - jj + 1;
1442           jsec = min(i2,i3);
1443           ujsec = jsec - jsec % 4;
1444           i2 = k;
1445           for (ll = 1; ll <= i2; ll += 256)
1446             {
1447               /* Computing MIN */
1448               i3 = 256;
1449               i4 = k - ll + 1;
1450               lsec = min(i3,i4);
1451               ulsec = lsec - lsec % 2;
1452
1453               i3 = m;
1454               for (ii = 1; ii <= i3; ii += 256)
1455                 {
1456                   /* Computing MIN */
1457                   i4 = 256;
1458                   i5 = m - ii + 1;
1459                   isec = min(i4,i5);
1460                   uisec = isec - isec % 2;
1461                   i4 = ll + ulsec - 1;
1462                   for (l = ll; l <= i4; l += 2)
1463                     {
1464                       i5 = ii + uisec - 1;
1465                       for (i = ii; i <= i5; i += 2)
1466                         {
1467                           t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
1468                                         a[i + l * a_dim1];
1469                           t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
1470                                         a[i + (l + 1) * a_dim1];
1471                           t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
1472                                         a[i + 1 + l * a_dim1];
1473                           t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
1474                                         a[i + 1 + (l + 1) * a_dim1];
1475                         }
1476                       if (uisec < isec)
1477                         {
1478                           t1[l - ll + 1 + (isec << 8) - 257] =
1479                                     a[ii + isec - 1 + l * a_dim1];
1480                           t1[l - ll + 2 + (isec << 8) - 257] =
1481                                     a[ii + isec - 1 + (l + 1) * a_dim1];
1482                         }
1483                     }
1484                   if (ulsec < lsec)
1485                     {
1486                       i4 = ii + isec - 1;
1487                       for (i = ii; i<= i4; ++i)
1488                         {
1489                           t1[lsec + ((i - ii + 1) << 8) - 257] =
1490                                     a[i + (ll + lsec - 1) * a_dim1];
1491                         }
1492                     }
1493
1494                   uisec = isec - isec % 4;
1495                   i4 = jj + ujsec - 1;
1496                   for (j = jj; j <= i4; j += 4)
1497                     {
1498                       i5 = ii + uisec - 1;
1499                       for (i = ii; i <= i5; i += 4)
1500                         {
1501                           f11 = c[i + j * c_dim1];
1502                           f21 = c[i + 1 + j * c_dim1];
1503                           f12 = c[i + (j + 1) * c_dim1];
1504                           f22 = c[i + 1 + (j + 1) * c_dim1];
1505                           f13 = c[i + (j + 2) * c_dim1];
1506                           f23 = c[i + 1 + (j + 2) * c_dim1];
1507                           f14 = c[i + (j + 3) * c_dim1];
1508                           f24 = c[i + 1 + (j + 3) * c_dim1];
1509                           f31 = c[i + 2 + j * c_dim1];
1510                           f41 = c[i + 3 + j * c_dim1];
1511                           f32 = c[i + 2 + (j + 1) * c_dim1];
1512                           f42 = c[i + 3 + (j + 1) * c_dim1];
1513                           f33 = c[i + 2 + (j + 2) * c_dim1];
1514                           f43 = c[i + 3 + (j + 2) * c_dim1];
1515                           f34 = c[i + 2 + (j + 3) * c_dim1];
1516                           f44 = c[i + 3 + (j + 3) * c_dim1];
1517                           i6 = ll + lsec - 1;
1518                           for (l = ll; l <= i6; ++l)
1519                             {
1520                               f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1521                                       * b[l + j * b_dim1];
1522                               f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1523                                       * b[l + j * b_dim1];
1524                               f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1525                                       * b[l + (j + 1) * b_dim1];
1526                               f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1527                                       * b[l + (j + 1) * b_dim1];
1528                               f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1529                                       * b[l + (j + 2) * b_dim1];
1530                               f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1531                                       * b[l + (j + 2) * b_dim1];
1532                               f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1533                                       * b[l + (j + 3) * b_dim1];
1534                               f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1535                                       * b[l + (j + 3) * b_dim1];
1536                               f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1537                                       * b[l + j * b_dim1];
1538                               f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1539                                       * b[l + j * b_dim1];
1540                               f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1541                                       * b[l + (j + 1) * b_dim1];
1542                               f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1543                                       * b[l + (j + 1) * b_dim1];
1544                               f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1545                                       * b[l + (j + 2) * b_dim1];
1546                               f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1547                                       * b[l + (j + 2) * b_dim1];
1548                               f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1549                                       * b[l + (j + 3) * b_dim1];
1550                               f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1551                                       * b[l + (j + 3) * b_dim1];
1552                             }
1553                           c[i + j * c_dim1] = f11;
1554                           c[i + 1 + j * c_dim1] = f21;
1555                           c[i + (j + 1) * c_dim1] = f12;
1556                           c[i + 1 + (j + 1) * c_dim1] = f22;
1557                           c[i + (j + 2) * c_dim1] = f13;
1558                           c[i + 1 + (j + 2) * c_dim1] = f23;
1559                           c[i + (j + 3) * c_dim1] = f14;
1560                           c[i + 1 + (j + 3) * c_dim1] = f24;
1561                           c[i + 2 + j * c_dim1] = f31;
1562                           c[i + 3 + j * c_dim1] = f41;
1563                           c[i + 2 + (j + 1) * c_dim1] = f32;
1564                           c[i + 3 + (j + 1) * c_dim1] = f42;
1565                           c[i + 2 + (j + 2) * c_dim1] = f33;
1566                           c[i + 3 + (j + 2) * c_dim1] = f43;
1567                           c[i + 2 + (j + 3) * c_dim1] = f34;
1568                           c[i + 3 + (j + 3) * c_dim1] = f44;
1569                         }
1570                       if (uisec < isec)
1571                         {
1572                           i5 = ii + isec - 1;
1573                           for (i = ii + uisec; i <= i5; ++i)
1574                             {
1575                               f11 = c[i + j * c_dim1];
1576                               f12 = c[i + (j + 1) * c_dim1];
1577                               f13 = c[i + (j + 2) * c_dim1];
1578                               f14 = c[i + (j + 3) * c_dim1];
1579                               i6 = ll + lsec - 1;
1580                               for (l = ll; l <= i6; ++l)
1581                                 {
1582                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1583                                           257] * b[l + j * b_dim1];
1584                                   f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1585                                           257] * b[l + (j + 1) * b_dim1];
1586                                   f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1587                                           257] * b[l + (j + 2) * b_dim1];
1588                                   f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1589                                           257] * b[l + (j + 3) * b_dim1];
1590                                 }
1591                               c[i + j * c_dim1] = f11;
1592                               c[i + (j + 1) * c_dim1] = f12;
1593                               c[i + (j + 2) * c_dim1] = f13;
1594                               c[i + (j + 3) * c_dim1] = f14;
1595                             }
1596                         }
1597                     }
1598                   if (ujsec < jsec)
1599                     {
1600                       i4 = jj + jsec - 1;
1601                       for (j = jj + ujsec; j <= i4; ++j)
1602                         {
1603                           i5 = ii + uisec - 1;
1604                           for (i = ii; i <= i5; i += 4)
1605                             {
1606                               f11 = c[i + j * c_dim1];
1607                               f21 = c[i + 1 + j * c_dim1];
1608                               f31 = c[i + 2 + j * c_dim1];
1609                               f41 = c[i + 3 + j * c_dim1];
1610                               i6 = ll + lsec - 1;
1611                               for (l = ll; l <= i6; ++l)
1612                                 {
1613                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1614                                           257] * b[l + j * b_dim1];
1615                                   f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
1616                                           257] * b[l + j * b_dim1];
1617                                   f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
1618                                           257] * b[l + j * b_dim1];
1619                                   f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
1620                                           257] * b[l + j * b_dim1];
1621                                 }
1622                               c[i + j * c_dim1] = f11;
1623                               c[i + 1 + j * c_dim1] = f21;
1624                               c[i + 2 + j * c_dim1] = f31;
1625                               c[i + 3 + j * c_dim1] = f41;
1626                             }
1627                           i5 = ii + isec - 1;
1628                           for (i = ii + uisec; i <= i5; ++i)
1629                             {
1630                               f11 = c[i + j * c_dim1];
1631                               i6 = ll + lsec - 1;
1632                               for (l = ll; l <= i6; ++l)
1633                                 {
1634                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1635                                           257] * b[l + j * b_dim1];
1636                                 }
1637                               c[i + j * c_dim1] = f11;
1638                             }
1639                         }
1640                     }
1641                 }
1642             }
1643         }
1644       return;
1645     }
1646   else if (rxstride == 1 && aystride == 1 && bxstride == 1)
1647     {
1648       if (GFC_DESCRIPTOR_RANK (a) != 1)
1649         {
1650           const GFC_REAL_4 *restrict abase_x;
1651           const GFC_REAL_4 *restrict bbase_y;
1652           GFC_REAL_4 *restrict dest_y;
1653           GFC_REAL_4 s;
1654
1655           for (y = 0; y < ycount; y++)
1656             {
1657               bbase_y = &bbase[y*bystride];
1658               dest_y = &dest[y*rystride];
1659               for (x = 0; x < xcount; x++)
1660                 {
1661                   abase_x = &abase[x*axstride];
1662                   s = (GFC_REAL_4) 0;
1663                   for (n = 0; n < count; n++)
1664                     s += abase_x[n] * bbase_y[n];
1665                   dest_y[x] = s;
1666                 }
1667             }
1668         }
1669       else
1670         {
1671           const GFC_REAL_4 *restrict bbase_y;
1672           GFC_REAL_4 s;
1673
1674           for (y = 0; y < ycount; y++)
1675             {
1676               bbase_y = &bbase[y*bystride];
1677               s = (GFC_REAL_4) 0;
1678               for (n = 0; n < count; n++)
1679                 s += abase[n*axstride] * bbase_y[n];
1680               dest[y*rystride] = s;
1681             }
1682         }
1683     }
1684   else if (axstride < aystride)
1685     {
1686       for (y = 0; y < ycount; y++)
1687         for (x = 0; x < xcount; x++)
1688           dest[x*rxstride + y*rystride] = (GFC_REAL_4)0;
1689
1690       for (y = 0; y < ycount; y++)
1691         for (n = 0; n < count; n++)
1692           for (x = 0; x < xcount; x++)
1693             /* dest[x,y] += a[x,n] * b[n,y] */
1694             dest[x*rxstride + y*rystride] +=
1695                                         abase[x*axstride + n*aystride] *
1696                                         bbase[n*bxstride + y*bystride];
1697     }
1698   else if (GFC_DESCRIPTOR_RANK (a) == 1)
1699     {
1700       const GFC_REAL_4 *restrict bbase_y;
1701       GFC_REAL_4 s;
1702
1703       for (y = 0; y < ycount; y++)
1704         {
1705           bbase_y = &bbase[y*bystride];
1706           s = (GFC_REAL_4) 0;
1707           for (n = 0; n < count; n++)
1708             s += abase[n*axstride] * bbase_y[n*bxstride];
1709           dest[y*rxstride] = s;
1710         }
1711     }
1712   else
1713     {
1714       const GFC_REAL_4 *restrict abase_x;
1715       const GFC_REAL_4 *restrict bbase_y;
1716       GFC_REAL_4 *restrict dest_y;
1717       GFC_REAL_4 s;
1718
1719       for (y = 0; y < ycount; y++)
1720         {
1721           bbase_y = &bbase[y*bystride];
1722           dest_y = &dest[y*rystride];
1723           for (x = 0; x < xcount; x++)
1724             {
1725               abase_x = &abase[x*axstride];
1726               s = (GFC_REAL_4) 0;
1727               for (n = 0; n < count; n++)
1728                 s += abase_x[n*aystride] * bbase_y[n*bxstride];
1729               dest_y[x*rxstride] = s;
1730             }
1731         }
1732     }
1733 }
1734 #undef POW3
1735 #undef min
1736 #undef max
1737
1738 #endif  /* HAVE_AVX512F */
1739
1740 /* Function to fall back to if there is no special processor-specific version.  */
1741 static void
1742 matmul_r4_vanilla (gfc_array_r4 * const restrict retarray,
1743         gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
1744         int blas_limit, blas_call gemm)
1745 {
1746   const GFC_REAL_4 * restrict abase;
1747   const GFC_REAL_4 * restrict bbase;
1748   GFC_REAL_4 * restrict dest;
1749
1750   index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
1751   index_type x, y, n, count, xcount, ycount;
1752
1753   assert (GFC_DESCRIPTOR_RANK (a) == 2
1754           || GFC_DESCRIPTOR_RANK (b) == 2);
1755
1756 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
1757
1758    Either A or B (but not both) can be rank 1:
1759
1760    o One-dimensional argument A is implicitly treated as a row matrix
1761      dimensioned [1,count], so xcount=1.
1762
1763    o One-dimensional argument B is implicitly treated as a column matrix
1764      dimensioned [count, 1], so ycount=1.
1765 */
1766
1767   if (retarray->base_addr == NULL)
1768     {
1769       if (GFC_DESCRIPTOR_RANK (a) == 1)
1770         {
1771           GFC_DIMENSION_SET(retarray->dim[0], 0,
1772                             GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
1773         }
1774       else if (GFC_DESCRIPTOR_RANK (b) == 1)
1775         {
1776           GFC_DIMENSION_SET(retarray->dim[0], 0,
1777                             GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1778         }
1779       else
1780         {
1781           GFC_DIMENSION_SET(retarray->dim[0], 0,
1782                             GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1783
1784           GFC_DIMENSION_SET(retarray->dim[1], 0,
1785                             GFC_DESCRIPTOR_EXTENT(b,1) - 1,
1786                             GFC_DESCRIPTOR_EXTENT(retarray,0));
1787         }
1788
1789       retarray->base_addr
1790         = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_4));
1791       retarray->offset = 0;
1792     }
1793   else if (unlikely (compile_options.bounds_check))
1794     {
1795       index_type ret_extent, arg_extent;
1796
1797       if (GFC_DESCRIPTOR_RANK (a) == 1)
1798         {
1799           arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1800           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1801           if (arg_extent != ret_extent)
1802             runtime_error ("Incorrect extent in return array in"
1803                            " MATMUL intrinsic: is %ld, should be %ld",
1804                            (long int) ret_extent, (long int) arg_extent);
1805         }
1806       else if (GFC_DESCRIPTOR_RANK (b) == 1)
1807         {
1808           arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1809           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1810           if (arg_extent != ret_extent)
1811             runtime_error ("Incorrect extent in return array in"
1812                            " MATMUL intrinsic: is %ld, should be %ld",
1813                            (long int) ret_extent, (long int) arg_extent);
1814         }
1815       else
1816         {
1817           arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1818           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1819           if (arg_extent != ret_extent)
1820             runtime_error ("Incorrect extent in return array in"
1821                            " MATMUL intrinsic for dimension 1:"
1822                            " is %ld, should be %ld",
1823                            (long int) ret_extent, (long int) arg_extent);
1824
1825           arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1826           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
1827           if (arg_extent != ret_extent)
1828             runtime_error ("Incorrect extent in return array in"
1829                            " MATMUL intrinsic for dimension 2:"
1830                            " is %ld, should be %ld",
1831                            (long int) ret_extent, (long int) arg_extent);
1832         }
1833     }
1834
1835
1836   if (GFC_DESCRIPTOR_RANK (retarray) == 1)
1837     {
1838       /* One-dimensional result may be addressed in the code below
1839          either as a row or a column matrix. We want both cases to
1840          work. */
1841       rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1842     }
1843   else
1844     {
1845       rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1846       rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
1847     }
1848
1849
1850   if (GFC_DESCRIPTOR_RANK (a) == 1)
1851     {
1852       /* Treat it as a a row matrix A[1,count]. */
1853       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1854       aystride = 1;
1855
1856       xcount = 1;
1857       count = GFC_DESCRIPTOR_EXTENT(a,0);
1858     }
1859   else
1860     {
1861       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1862       aystride = GFC_DESCRIPTOR_STRIDE(a,1);
1863
1864       count = GFC_DESCRIPTOR_EXTENT(a,1);
1865       xcount = GFC_DESCRIPTOR_EXTENT(a,0);
1866     }
1867
1868   if (count != GFC_DESCRIPTOR_EXTENT(b,0))
1869     {
1870       if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
1871         runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
1872     }
1873
1874   if (GFC_DESCRIPTOR_RANK (b) == 1)
1875     {
1876       /* Treat it as a column matrix B[count,1] */
1877       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1878
1879       /* bystride should never be used for 1-dimensional b.
1880          in case it is we want it to cause a segfault, rather than
1881          an incorrect result. */
1882       bystride = 0xDEADBEEF;
1883       ycount = 1;
1884     }
1885   else
1886     {
1887       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1888       bystride = GFC_DESCRIPTOR_STRIDE(b,1);
1889       ycount = GFC_DESCRIPTOR_EXTENT(b,1);
1890     }
1891
1892   abase = a->base_addr;
1893   bbase = b->base_addr;
1894   dest = retarray->base_addr;
1895
1896   /* Now that everything is set up, we perform the multiplication
1897      itself.  */
1898
1899 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
1900 #define min(a,b) ((a) <= (b) ? (a) : (b))
1901 #define max(a,b) ((a) >= (b) ? (a) : (b))
1902
1903   if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
1904       && (bxstride == 1 || bystride == 1)
1905       && (((float) xcount) * ((float) ycount) * ((float) count)
1906           > POW3(blas_limit)))
1907     {
1908       const int m = xcount, n = ycount, k = count, ldc = rystride;
1909       const GFC_REAL_4 one = 1, zero = 0;
1910       const int lda = (axstride == 1) ? aystride : axstride,
1911                 ldb = (bxstride == 1) ? bystride : bxstride;
1912
1913       if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
1914         {
1915           assert (gemm != NULL);
1916           gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
1917                 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
1918                 &ldc, 1, 1);
1919           return;
1920         }
1921     }
1922
1923   if (rxstride == 1 && axstride == 1 && bxstride == 1)
1924     {
1925       /* This block of code implements a tuned matmul, derived from
1926          Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
1927
1928                Bo Kagstrom and Per Ling
1929                Department of Computing Science
1930                Umea University
1931                S-901 87 Umea, Sweden
1932
1933          from netlib.org, translated to C, and modified for matmul.m4.  */
1934
1935       const GFC_REAL_4 *a, *b;
1936       GFC_REAL_4 *c;
1937       const index_type m = xcount, n = ycount, k = count;
1938
1939       /* System generated locals */
1940       index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
1941                  i1, i2, i3, i4, i5, i6;
1942
1943       /* Local variables */
1944       GFC_REAL_4 f11, f12, f21, f22, f31, f32, f41, f42,
1945                  f13, f14, f23, f24, f33, f34, f43, f44;
1946       index_type i, j, l, ii, jj, ll;
1947       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
1948
1949       a = abase;
1950       b = bbase;
1951       c = retarray->base_addr;
1952
1953       /* Parameter adjustments */
1954       c_dim1 = rystride;
1955       c_offset = 1 + c_dim1;
1956       c -= c_offset;
1957       a_dim1 = aystride;
1958       a_offset = 1 + a_dim1;
1959       a -= a_offset;
1960       b_dim1 = bystride;
1961       b_offset = 1 + b_dim1;
1962       b -= b_offset;
1963
1964       /* Early exit if possible */
1965       if (m == 0 || n == 0 || k == 0)
1966         return;
1967
1968       /* Adjust size of t1 to what is needed.  */
1969       index_type t1_dim;
1970       t1_dim = (a_dim1-1) * 256 + b_dim1;
1971       if (t1_dim > 65536)
1972         t1_dim = 65536;
1973
1974 #pragma GCC diagnostic push
1975 #pragma GCC diagnostic ignored "-Wvla"
1976       GFC_REAL_4 t1[t1_dim]; /* was [256][256] */
1977 #pragma GCC diagnostic pop
1978
1979       /* Empty c first.  */
1980       for (j=1; j<=n; j++)
1981         for (i=1; i<=m; i++)
1982           c[i + j * c_dim1] = (GFC_REAL_4)0;
1983
1984       /* Start turning the crank. */
1985       i1 = n;
1986       for (jj = 1; jj <= i1; jj += 512)
1987         {
1988           /* Computing MIN */
1989           i2 = 512;
1990           i3 = n - jj + 1;
1991           jsec = min(i2,i3);
1992           ujsec = jsec - jsec % 4;
1993           i2 = k;
1994           for (ll = 1; ll <= i2; ll += 256)
1995             {
1996               /* Computing MIN */
1997               i3 = 256;
1998               i4 = k - ll + 1;
1999               lsec = min(i3,i4);
2000               ulsec = lsec - lsec % 2;
2001
2002               i3 = m;
2003               for (ii = 1; ii <= i3; ii += 256)
2004                 {
2005                   /* Computing MIN */
2006                   i4 = 256;
2007                   i5 = m - ii + 1;
2008                   isec = min(i4,i5);
2009                   uisec = isec - isec % 2;
2010                   i4 = ll + ulsec - 1;
2011                   for (l = ll; l <= i4; l += 2)
2012                     {
2013                       i5 = ii + uisec - 1;
2014                       for (i = ii; i <= i5; i += 2)
2015                         {
2016                           t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
2017                                         a[i + l * a_dim1];
2018                           t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
2019                                         a[i + (l + 1) * a_dim1];
2020                           t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
2021                                         a[i + 1 + l * a_dim1];
2022                           t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
2023                                         a[i + 1 + (l + 1) * a_dim1];
2024                         }
2025                       if (uisec < isec)
2026                         {
2027                           t1[l - ll + 1 + (isec << 8) - 257] =
2028                                     a[ii + isec - 1 + l * a_dim1];
2029                           t1[l - ll + 2 + (isec << 8) - 257] =
2030                                     a[ii + isec - 1 + (l + 1) * a_dim1];
2031                         }
2032                     }
2033                   if (ulsec < lsec)
2034                     {
2035                       i4 = ii + isec - 1;
2036                       for (i = ii; i<= i4; ++i)
2037                         {
2038                           t1[lsec + ((i - ii + 1) << 8) - 257] =
2039                                     a[i + (ll + lsec - 1) * a_dim1];
2040                         }
2041                     }
2042
2043                   uisec = isec - isec % 4;
2044                   i4 = jj + ujsec - 1;
2045                   for (j = jj; j <= i4; j += 4)
2046                     {
2047                       i5 = ii + uisec - 1;
2048                       for (i = ii; i <= i5; i += 4)
2049                         {
2050                           f11 = c[i + j * c_dim1];
2051                           f21 = c[i + 1 + j * c_dim1];
2052                           f12 = c[i + (j + 1) * c_dim1];
2053                           f22 = c[i + 1 + (j + 1) * c_dim1];
2054                           f13 = c[i + (j + 2) * c_dim1];
2055                           f23 = c[i + 1 + (j + 2) * c_dim1];
2056                           f14 = c[i + (j + 3) * c_dim1];
2057                           f24 = c[i + 1 + (j + 3) * c_dim1];
2058                           f31 = c[i + 2 + j * c_dim1];
2059                           f41 = c[i + 3 + j * c_dim1];
2060                           f32 = c[i + 2 + (j + 1) * c_dim1];
2061                           f42 = c[i + 3 + (j + 1) * c_dim1];
2062                           f33 = c[i + 2 + (j + 2) * c_dim1];
2063                           f43 = c[i + 3 + (j + 2) * c_dim1];
2064                           f34 = c[i + 2 + (j + 3) * c_dim1];
2065                           f44 = c[i + 3 + (j + 3) * c_dim1];
2066                           i6 = ll + lsec - 1;
2067                           for (l = ll; l <= i6; ++l)
2068                             {
2069                               f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2070                                       * b[l + j * b_dim1];
2071                               f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2072                                       * b[l + j * b_dim1];
2073                               f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2074                                       * b[l + (j + 1) * b_dim1];
2075                               f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2076                                       * b[l + (j + 1) * b_dim1];
2077                               f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2078                                       * b[l + (j + 2) * b_dim1];
2079                               f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2080                                       * b[l + (j + 2) * b_dim1];
2081                               f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2082                                       * b[l + (j + 3) * b_dim1];
2083                               f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2084                                       * b[l + (j + 3) * b_dim1];
2085                               f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2086                                       * b[l + j * b_dim1];
2087                               f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2088                                       * b[l + j * b_dim1];
2089                               f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2090                                       * b[l + (j + 1) * b_dim1];
2091                               f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2092                                       * b[l + (j + 1) * b_dim1];
2093                               f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2094                                       * b[l + (j + 2) * b_dim1];
2095                               f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2096                                       * b[l + (j + 2) * b_dim1];
2097                               f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2098                                       * b[l + (j + 3) * b_dim1];
2099                               f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2100                                       * b[l + (j + 3) * b_dim1];
2101                             }
2102                           c[i + j * c_dim1] = f11;
2103                           c[i + 1 + j * c_dim1] = f21;
2104                           c[i + (j + 1) * c_dim1] = f12;
2105                           c[i + 1 + (j + 1) * c_dim1] = f22;
2106                           c[i + (j + 2) * c_dim1] = f13;
2107                           c[i + 1 + (j + 2) * c_dim1] = f23;
2108                           c[i + (j + 3) * c_dim1] = f14;
2109                           c[i + 1 + (j + 3) * c_dim1] = f24;
2110                           c[i + 2 + j * c_dim1] = f31;
2111                           c[i + 3 + j * c_dim1] = f41;
2112                           c[i + 2 + (j + 1) * c_dim1] = f32;
2113                           c[i + 3 + (j + 1) * c_dim1] = f42;
2114                           c[i + 2 + (j + 2) * c_dim1] = f33;
2115                           c[i + 3 + (j + 2) * c_dim1] = f43;
2116                           c[i + 2 + (j + 3) * c_dim1] = f34;
2117                           c[i + 3 + (j + 3) * c_dim1] = f44;
2118                         }
2119                       if (uisec < isec)
2120                         {
2121                           i5 = ii + isec - 1;
2122                           for (i = ii + uisec; i <= i5; ++i)
2123                             {
2124                               f11 = c[i + j * c_dim1];
2125                               f12 = c[i + (j + 1) * c_dim1];
2126                               f13 = c[i + (j + 2) * c_dim1];
2127                               f14 = c[i + (j + 3) * c_dim1];
2128                               i6 = ll + lsec - 1;
2129                               for (l = ll; l <= i6; ++l)
2130                                 {
2131                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2132                                           257] * b[l + j * b_dim1];
2133                                   f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2134                                           257] * b[l + (j + 1) * b_dim1];
2135                                   f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2136                                           257] * b[l + (j + 2) * b_dim1];
2137                                   f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2138                                           257] * b[l + (j + 3) * b_dim1];
2139                                 }
2140                               c[i + j * c_dim1] = f11;
2141                               c[i + (j + 1) * c_dim1] = f12;
2142                               c[i + (j + 2) * c_dim1] = f13;
2143                               c[i + (j + 3) * c_dim1] = f14;
2144                             }
2145                         }
2146                     }
2147                   if (ujsec < jsec)
2148                     {
2149                       i4 = jj + jsec - 1;
2150                       for (j = jj + ujsec; j <= i4; ++j)
2151                         {
2152                           i5 = ii + uisec - 1;
2153                           for (i = ii; i <= i5; i += 4)
2154                             {
2155                               f11 = c[i + j * c_dim1];
2156                               f21 = c[i + 1 + j * c_dim1];
2157                               f31 = c[i + 2 + j * c_dim1];
2158                               f41 = c[i + 3 + j * c_dim1];
2159                               i6 = ll + lsec - 1;
2160                               for (l = ll; l <= i6; ++l)
2161                                 {
2162                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2163                                           257] * b[l + j * b_dim1];
2164                                   f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
2165                                           257] * b[l + j * b_dim1];
2166                                   f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
2167                                           257] * b[l + j * b_dim1];
2168                                   f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
2169                                           257] * b[l + j * b_dim1];
2170                                 }
2171                               c[i + j * c_dim1] = f11;
2172                               c[i + 1 + j * c_dim1] = f21;
2173                               c[i + 2 + j * c_dim1] = f31;
2174                               c[i + 3 + j * c_dim1] = f41;
2175                             }
2176                           i5 = ii + isec - 1;
2177                           for (i = ii + uisec; i <= i5; ++i)
2178                             {
2179                               f11 = c[i + j * c_dim1];
2180                               i6 = ll + lsec - 1;
2181                               for (l = ll; l <= i6; ++l)
2182                                 {
2183                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2184                                           257] * b[l + j * b_dim1];
2185                                 }
2186                               c[i + j * c_dim1] = f11;
2187                             }
2188                         }
2189                     }
2190                 }
2191             }
2192         }
2193       return;
2194     }
2195   else if (rxstride == 1 && aystride == 1 && bxstride == 1)
2196     {
2197       if (GFC_DESCRIPTOR_RANK (a) != 1)
2198         {
2199           const GFC_REAL_4 *restrict abase_x;
2200           const GFC_REAL_4 *restrict bbase_y;
2201           GFC_REAL_4 *restrict dest_y;
2202           GFC_REAL_4 s;
2203
2204           for (y = 0; y < ycount; y++)
2205             {
2206               bbase_y = &bbase[y*bystride];
2207               dest_y = &dest[y*rystride];
2208               for (x = 0; x < xcount; x++)
2209                 {
2210                   abase_x = &abase[x*axstride];
2211                   s = (GFC_REAL_4) 0;
2212                   for (n = 0; n < count; n++)
2213                     s += abase_x[n] * bbase_y[n];
2214                   dest_y[x] = s;
2215                 }
2216             }
2217         }
2218       else
2219         {
2220           const GFC_REAL_4 *restrict bbase_y;
2221           GFC_REAL_4 s;
2222
2223           for (y = 0; y < ycount; y++)
2224             {
2225               bbase_y = &bbase[y*bystride];
2226               s = (GFC_REAL_4) 0;
2227               for (n = 0; n < count; n++)
2228                 s += abase[n*axstride] * bbase_y[n];
2229               dest[y*rystride] = s;
2230             }
2231         }
2232     }
2233   else if (axstride < aystride)
2234     {
2235       for (y = 0; y < ycount; y++)
2236         for (x = 0; x < xcount; x++)
2237           dest[x*rxstride + y*rystride] = (GFC_REAL_4)0;
2238
2239       for (y = 0; y < ycount; y++)
2240         for (n = 0; n < count; n++)
2241           for (x = 0; x < xcount; x++)
2242             /* dest[x,y] += a[x,n] * b[n,y] */
2243             dest[x*rxstride + y*rystride] +=
2244                                         abase[x*axstride + n*aystride] *
2245                                         bbase[n*bxstride + y*bystride];
2246     }
2247   else if (GFC_DESCRIPTOR_RANK (a) == 1)
2248     {
2249       const GFC_REAL_4 *restrict bbase_y;
2250       GFC_REAL_4 s;
2251
2252       for (y = 0; y < ycount; y++)
2253         {
2254           bbase_y = &bbase[y*bystride];
2255           s = (GFC_REAL_4) 0;
2256           for (n = 0; n < count; n++)
2257             s += abase[n*axstride] * bbase_y[n*bxstride];
2258           dest[y*rxstride] = s;
2259         }
2260     }
2261   else
2262     {
2263       const GFC_REAL_4 *restrict abase_x;
2264       const GFC_REAL_4 *restrict bbase_y;
2265       GFC_REAL_4 *restrict dest_y;
2266       GFC_REAL_4 s;
2267
2268       for (y = 0; y < ycount; y++)
2269         {
2270           bbase_y = &bbase[y*bystride];
2271           dest_y = &dest[y*rystride];
2272           for (x = 0; x < xcount; x++)
2273             {
2274               abase_x = &abase[x*axstride];
2275               s = (GFC_REAL_4) 0;
2276               for (n = 0; n < count; n++)
2277                 s += abase_x[n*aystride] * bbase_y[n*bxstride];
2278               dest_y[x*rxstride] = s;
2279             }
2280         }
2281     }
2282 }
2283 #undef POW3
2284 #undef min
2285 #undef max
2286
2287
2288 /* Compiling main function, with selection code for the processor.  */
2289
2290 /* Currently, this is i386 only.  Adjust for other architectures.  */
2291
2292 #include <config/i386/cpuinfo.h>
2293 void matmul_r4 (gfc_array_r4 * const restrict retarray,
2294         gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
2295         int blas_limit, blas_call gemm)
2296 {
2297   static void (*matmul_p) (gfc_array_r4 * const restrict retarray,
2298         gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
2299         int blas_limit, blas_call gemm);
2300
2301   void (*matmul_fn) (gfc_array_r4 * const restrict retarray,
2302         gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
2303         int blas_limit, blas_call gemm);
2304
2305   matmul_fn = __atomic_load_n (&matmul_p, __ATOMIC_RELAXED);
2306   if (matmul_fn == NULL)
2307     {
2308       matmul_fn = matmul_r4_vanilla;
2309       if (__cpu_model.__cpu_vendor == VENDOR_INTEL)
2310         {
2311           /* Run down the available processors in order of preference.  */
2312 #ifdef HAVE_AVX512F
2313           if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX512F))
2314             {
2315               matmul_fn = matmul_r4_avx512f;
2316               goto store;
2317             }
2318
2319 #endif  /* HAVE_AVX512F */
2320
2321 #ifdef HAVE_AVX2
2322           if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
2323              && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
2324             {
2325               matmul_fn = matmul_r4_avx2;
2326               goto store;
2327             }
2328
2329 #endif
2330
2331 #ifdef HAVE_AVX
2332           if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
2333             {
2334               matmul_fn = matmul_r4_avx;
2335               goto store;
2336             }
2337 #endif  /* HAVE_AVX */
2338         }
2339    store:
2340       __atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
2341    }
2342
2343    (*matmul_fn) (retarray, a, b, try_blas, blas_limit, gemm);
2344 }
2345
2346 #else  /* Just the vanilla function.  */
2347
2348 void
2349 matmul_r4 (gfc_array_r4 * const restrict retarray,
2350         gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
2351         int blas_limit, blas_call gemm)
2352 {
2353   const GFC_REAL_4 * restrict abase;
2354   const GFC_REAL_4 * restrict bbase;
2355   GFC_REAL_4 * restrict dest;
2356
2357   index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
2358   index_type x, y, n, count, xcount, ycount;
2359
2360   assert (GFC_DESCRIPTOR_RANK (a) == 2
2361           || GFC_DESCRIPTOR_RANK (b) == 2);
2362
2363 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
2364
2365    Either A or B (but not both) can be rank 1:
2366
2367    o One-dimensional argument A is implicitly treated as a row matrix
2368      dimensioned [1,count], so xcount=1.
2369
2370    o One-dimensional argument B is implicitly treated as a column matrix
2371      dimensioned [count, 1], so ycount=1.
2372 */
2373
2374   if (retarray->base_addr == NULL)
2375     {
2376       if (GFC_DESCRIPTOR_RANK (a) == 1)
2377         {
2378           GFC_DIMENSION_SET(retarray->dim[0], 0,
2379                             GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
2380         }
2381       else if (GFC_DESCRIPTOR_RANK (b) == 1)
2382         {
2383           GFC_DIMENSION_SET(retarray->dim[0], 0,
2384                             GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
2385         }
2386       else
2387         {
2388           GFC_DIMENSION_SET(retarray->dim[0], 0,
2389                             GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
2390
2391           GFC_DIMENSION_SET(retarray->dim[1], 0,
2392                             GFC_DESCRIPTOR_EXTENT(b,1) - 1,
2393                             GFC_DESCRIPTOR_EXTENT(retarray,0));
2394         }
2395
2396       retarray->base_addr
2397         = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_4));
2398       retarray->offset = 0;
2399     }
2400   else if (unlikely (compile_options.bounds_check))
2401     {
2402       index_type ret_extent, arg_extent;
2403
2404       if (GFC_DESCRIPTOR_RANK (a) == 1)
2405         {
2406           arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
2407           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2408           if (arg_extent != ret_extent)
2409             runtime_error ("Incorrect extent in return array in"
2410                            " MATMUL intrinsic: is %ld, should be %ld",
2411                            (long int) ret_extent, (long int) arg_extent);
2412         }
2413       else if (GFC_DESCRIPTOR_RANK (b) == 1)
2414         {
2415           arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
2416           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2417           if (arg_extent != ret_extent)
2418             runtime_error ("Incorrect extent in return array in"
2419                            " MATMUL intrinsic: is %ld, should be %ld",
2420                            (long int) ret_extent, (long int) arg_extent);
2421         }
2422       else
2423         {
2424           arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
2425           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2426           if (arg_extent != ret_extent)
2427             runtime_error ("Incorrect extent in return array in"
2428                            " MATMUL intrinsic for dimension 1:"
2429                            " is %ld, should be %ld",
2430                            (long int) ret_extent, (long int) arg_extent);
2431
2432           arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
2433           ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
2434           if (arg_extent != ret_extent)
2435             runtime_error ("Incorrect extent in return array in"
2436                            " MATMUL intrinsic for dimension 2:"
2437                            " is %ld, should be %ld",
2438                            (long int) ret_extent, (long int) arg_extent);
2439         }
2440     }
2441
2442
2443   if (GFC_DESCRIPTOR_RANK (retarray) == 1)
2444     {
2445       /* One-dimensional result may be addressed in the code below
2446          either as a row or a column matrix. We want both cases to
2447          work. */
2448       rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
2449     }
2450   else
2451     {
2452       rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
2453       rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
2454     }
2455
2456
2457   if (GFC_DESCRIPTOR_RANK (a) == 1)
2458     {
2459       /* Treat it as a a row matrix A[1,count]. */
2460       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
2461       aystride = 1;
2462
2463       xcount = 1;
2464       count = GFC_DESCRIPTOR_EXTENT(a,0);
2465     }
2466   else
2467     {
2468       axstride = GFC_DESCRIPTOR_STRIDE(a,0);
2469       aystride = GFC_DESCRIPTOR_STRIDE(a,1);
2470
2471       count = GFC_DESCRIPTOR_EXTENT(a,1);
2472       xcount = GFC_DESCRIPTOR_EXTENT(a,0);
2473     }
2474
2475   if (count != GFC_DESCRIPTOR_EXTENT(b,0))
2476     {
2477       if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
2478         runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
2479     }
2480
2481   if (GFC_DESCRIPTOR_RANK (b) == 1)
2482     {
2483       /* Treat it as a column matrix B[count,1] */
2484       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
2485
2486       /* bystride should never be used for 1-dimensional b.
2487          in case it is we want it to cause a segfault, rather than
2488          an incorrect result. */
2489       bystride = 0xDEADBEEF;
2490       ycount = 1;
2491     }
2492   else
2493     {
2494       bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
2495       bystride = GFC_DESCRIPTOR_STRIDE(b,1);
2496       ycount = GFC_DESCRIPTOR_EXTENT(b,1);
2497     }
2498
2499   abase = a->base_addr;
2500   bbase = b->base_addr;
2501   dest = retarray->base_addr;
2502
2503   /* Now that everything is set up, we perform the multiplication
2504      itself.  */
2505
2506 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
2507 #define min(a,b) ((a) <= (b) ? (a) : (b))
2508 #define max(a,b) ((a) >= (b) ? (a) : (b))
2509
2510   if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
2511       && (bxstride == 1 || bystride == 1)
2512       && (((float) xcount) * ((float) ycount) * ((float) count)
2513           > POW3(blas_limit)))
2514     {
2515       const int m = xcount, n = ycount, k = count, ldc = rystride;
2516       const GFC_REAL_4 one = 1, zero = 0;
2517       const int lda = (axstride == 1) ? aystride : axstride,
2518                 ldb = (bxstride == 1) ? bystride : bxstride;
2519
2520       if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
2521         {
2522           assert (gemm != NULL);
2523           gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
2524                 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
2525                 &ldc, 1, 1);
2526           return;
2527         }
2528     }
2529
2530   if (rxstride == 1 && axstride == 1 && bxstride == 1)
2531     {
2532       /* This block of code implements a tuned matmul, derived from
2533          Superscalar GEMM-based level 3 BLAS,  Beta version 0.1
2534
2535                Bo Kagstrom and Per Ling
2536                Department of Computing Science
2537                Umea University
2538                S-901 87 Umea, Sweden
2539
2540          from netlib.org, translated to C, and modified for matmul.m4.  */
2541
2542       const GFC_REAL_4 *a, *b;
2543       GFC_REAL_4 *c;
2544       const index_type m = xcount, n = ycount, k = count;
2545
2546       /* System generated locals */
2547       index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
2548                  i1, i2, i3, i4, i5, i6;
2549
2550       /* Local variables */
2551       GFC_REAL_4 f11, f12, f21, f22, f31, f32, f41, f42,
2552                  f13, f14, f23, f24, f33, f34, f43, f44;
2553       index_type i, j, l, ii, jj, ll;
2554       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
2555
2556       a = abase;
2557       b = bbase;
2558       c = retarray->base_addr;
2559
2560       /* Parameter adjustments */
2561       c_dim1 = rystride;
2562       c_offset = 1 + c_dim1;
2563       c -= c_offset;
2564       a_dim1 = aystride;
2565       a_offset = 1 + a_dim1;
2566       a -= a_offset;
2567       b_dim1 = bystride;
2568       b_offset = 1 + b_dim1;
2569       b -= b_offset;
2570
2571       /* Early exit if possible */
2572       if (m == 0 || n == 0 || k == 0)
2573         return;
2574
2575       /* Adjust size of t1 to what is needed.  */
2576       index_type t1_dim;
2577       t1_dim = (a_dim1-1) * 256 + b_dim1;
2578       if (t1_dim > 65536)
2579         t1_dim = 65536;
2580
2581 #pragma GCC diagnostic push
2582 #pragma GCC diagnostic ignored "-Wvla"
2583       GFC_REAL_4 t1[t1_dim]; /* was [256][256] */
2584 #pragma GCC diagnostic pop
2585
2586       /* Empty c first.  */
2587       for (j=1; j<=n; j++)
2588         for (i=1; i<=m; i++)
2589           c[i + j * c_dim1] = (GFC_REAL_4)0;
2590
2591       /* Start turning the crank. */
2592       i1 = n;
2593       for (jj = 1; jj <= i1; jj += 512)
2594         {
2595           /* Computing MIN */
2596           i2 = 512;
2597           i3 = n - jj + 1;
2598           jsec = min(i2,i3);
2599           ujsec = jsec - jsec % 4;
2600           i2 = k;
2601           for (ll = 1; ll <= i2; ll += 256)
2602             {
2603               /* Computing MIN */
2604               i3 = 256;
2605               i4 = k - ll + 1;
2606               lsec = min(i3,i4);
2607               ulsec = lsec - lsec % 2;
2608
2609               i3 = m;
2610               for (ii = 1; ii <= i3; ii += 256)
2611                 {
2612                   /* Computing MIN */
2613                   i4 = 256;
2614                   i5 = m - ii + 1;
2615                   isec = min(i4,i5);
2616                   uisec = isec - isec % 2;
2617                   i4 = ll + ulsec - 1;
2618                   for (l = ll; l <= i4; l += 2)
2619                     {
2620                       i5 = ii + uisec - 1;
2621                       for (i = ii; i <= i5; i += 2)
2622                         {
2623                           t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
2624                                         a[i + l * a_dim1];
2625                           t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
2626                                         a[i + (l + 1) * a_dim1];
2627                           t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
2628                                         a[i + 1 + l * a_dim1];
2629                           t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
2630                                         a[i + 1 + (l + 1) * a_dim1];
2631                         }
2632                       if (uisec < isec)
2633                         {
2634                           t1[l - ll + 1 + (isec << 8) - 257] =
2635                                     a[ii + isec - 1 + l * a_dim1];
2636                           t1[l - ll + 2 + (isec << 8) - 257] =
2637                                     a[ii + isec - 1 + (l + 1) * a_dim1];
2638                         }
2639                     }
2640                   if (ulsec < lsec)
2641                     {
2642                       i4 = ii + isec - 1;
2643                       for (i = ii; i<= i4; ++i)
2644                         {
2645                           t1[lsec + ((i - ii + 1) << 8) - 257] =
2646                                     a[i + (ll + lsec - 1) * a_dim1];
2647                         }
2648                     }
2649
2650                   uisec = isec - isec % 4;
2651                   i4 = jj + ujsec - 1;
2652                   for (j = jj; j <= i4; j += 4)
2653                     {
2654                       i5 = ii + uisec - 1;
2655                       for (i = ii; i <= i5; i += 4)
2656                         {
2657                           f11 = c[i + j * c_dim1];
2658                           f21 = c[i + 1 + j * c_dim1];
2659                           f12 = c[i + (j + 1) * c_dim1];
2660                           f22 = c[i + 1 + (j + 1) * c_dim1];
2661                           f13 = c[i + (j + 2) * c_dim1];
2662                           f23 = c[i + 1 + (j + 2) * c_dim1];
2663                           f14 = c[i + (j + 3) * c_dim1];
2664                           f24 = c[i + 1 + (j + 3) * c_dim1];
2665                           f31 = c[i + 2 + j * c_dim1];
2666                           f41 = c[i + 3 + j * c_dim1];
2667                           f32 = c[i + 2 + (j + 1) * c_dim1];
2668                           f42 = c[i + 3 + (j + 1) * c_dim1];
2669                           f33 = c[i + 2 + (j + 2) * c_dim1];
2670                           f43 = c[i + 3 + (j + 2) * c_dim1];
2671                           f34 = c[i + 2 + (j + 3) * c_dim1];
2672                           f44 = c[i + 3 + (j + 3) * c_dim1];
2673                           i6 = ll + lsec - 1;
2674                           for (l = ll; l <= i6; ++l)
2675                             {
2676                               f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2677                                       * b[l + j * b_dim1];
2678                               f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2679                                       * b[l + j * b_dim1];
2680                               f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2681                                       * b[l + (j + 1) * b_dim1];
2682                               f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2683                                       * b[l + (j + 1) * b_dim1];
2684                               f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2685                                       * b[l + (j + 2) * b_dim1];
2686                               f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2687                                       * b[l + (j + 2) * b_dim1];
2688                               f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2689                                       * b[l + (j + 3) * b_dim1];
2690                               f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2691                                       * b[l + (j + 3) * b_dim1];
2692                               f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2693                                       * b[l + j * b_dim1];
2694                               f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2695                                       * b[l + j * b_dim1];
2696                               f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2697                                       * b[l + (j + 1) * b_dim1];
2698                               f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2699                                       * b[l + (j + 1) * b_dim1];
2700                               f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2701                                       * b[l + (j + 2) * b_dim1];
2702                               f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2703                                       * b[l + (j + 2) * b_dim1];
2704                               f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2705                                       * b[l + (j + 3) * b_dim1];
2706                               f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2707                                       * b[l + (j + 3) * b_dim1];
2708                             }
2709                           c[i + j * c_dim1] = f11;
2710                           c[i + 1 + j * c_dim1] = f21;
2711                           c[i + (j + 1) * c_dim1] = f12;
2712                           c[i + 1 + (j + 1) * c_dim1] = f22;
2713                           c[i + (j + 2) * c_dim1] = f13;
2714                           c[i + 1 + (j + 2) * c_dim1] = f23;
2715                           c[i + (j + 3) * c_dim1] = f14;
2716                           c[i + 1 + (j + 3) * c_dim1] = f24;
2717                           c[i + 2 + j * c_dim1] = f31;
2718                           c[i + 3 + j * c_dim1] = f41;
2719                           c[i + 2 + (j + 1) * c_dim1] = f32;
2720                           c[i + 3 + (j + 1) * c_dim1] = f42;
2721                           c[i + 2 + (j + 2) * c_dim1] = f33;
2722                           c[i + 3 + (j + 2) * c_dim1] = f43;
2723                           c[i + 2 + (j + 3) * c_dim1] = f34;
2724                           c[i + 3 + (j + 3) * c_dim1] = f44;
2725                         }
2726                       if (uisec < isec)
2727                         {
2728                           i5 = ii + isec - 1;
2729                           for (i = ii + uisec; i <= i5; ++i)
2730                             {
2731                               f11 = c[i + j * c_dim1];
2732                               f12 = c[i + (j + 1) * c_dim1];
2733                               f13 = c[i + (j + 2) * c_dim1];
2734                               f14 = c[i + (j + 3) * c_dim1];
2735                               i6 = ll + lsec - 1;
2736                               for (l = ll; l <= i6; ++l)
2737                                 {
2738                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2739                                           257] * b[l + j * b_dim1];
2740                                   f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2741                                           257] * b[l + (j + 1) * b_dim1];
2742                                   f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2743                                           257] * b[l + (j + 2) * b_dim1];
2744                                   f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2745                                           257] * b[l + (j + 3) * b_dim1];
2746                                 }
2747                               c[i + j * c_dim1] = f11;
2748                               c[i + (j + 1) * c_dim1] = f12;
2749                               c[i + (j + 2) * c_dim1] = f13;
2750                               c[i + (j + 3) * c_dim1] = f14;
2751                             }
2752                         }
2753                     }
2754                   if (ujsec < jsec)
2755                     {
2756                       i4 = jj + jsec - 1;
2757                       for (j = jj + ujsec; j <= i4; ++j)
2758                         {
2759                           i5 = ii + uisec - 1;
2760                           for (i = ii; i <= i5; i += 4)
2761                             {
2762                               f11 = c[i + j * c_dim1];
2763                               f21 = c[i + 1 + j * c_dim1];
2764                               f31 = c[i + 2 + j * c_dim1];
2765                               f41 = c[i + 3 + j * c_dim1];
2766                               i6 = ll + lsec - 1;
2767                               for (l = ll; l <= i6; ++l)
2768                                 {
2769                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2770                                           257] * b[l + j * b_dim1];
2771                                   f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
2772                                           257] * b[l + j * b_dim1];
2773                                   f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
2774                                           257] * b[l + j * b_dim1];
2775                                   f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
2776                                           257] * b[l + j * b_dim1];
2777                                 }
2778                               c[i + j * c_dim1] = f11;
2779                               c[i + 1 + j * c_dim1] = f21;
2780                               c[i + 2 + j * c_dim1] = f31;
2781                               c[i + 3 + j * c_dim1] = f41;
2782                             }
2783                           i5 = ii + isec - 1;
2784                           for (i = ii + uisec; i <= i5; ++i)
2785                             {
2786                               f11 = c[i + j * c_dim1];
2787                               i6 = ll + lsec - 1;
2788                               for (l = ll; l <= i6; ++l)
2789                                 {
2790                                   f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2791                                           257] * b[l + j * b_dim1];
2792                                 }
2793                               c[i + j * c_dim1] = f11;
2794                             }
2795                         }
2796                     }
2797                 }
2798             }
2799         }
2800       return;
2801     }
2802   else if (rxstride == 1 && aystride == 1 && bxstride == 1)
2803     {
2804       if (GFC_DESCRIPTOR_RANK (a) != 1)
2805         {
2806           const GFC_REAL_4 *restrict abase_x;
2807           const GFC_REAL_4 *restrict bbase_y;
2808           GFC_REAL_4 *restrict dest_y;
2809           GFC_REAL_4 s;
2810
2811           for (y = 0; y < ycount; y++)
2812             {
2813               bbase_y = &bbase[y*bystride];
2814               dest_y = &dest[y*rystride];
2815               for (x = 0; x < xcount; x++)
2816                 {
2817                   abase_x = &abase[x*axstride];
2818                   s = (GFC_REAL_4) 0;
2819                   for (n = 0; n < count; n++)
2820                     s += abase_x[n] * bbase_y[n];
2821                   dest_y[x] = s;
2822                 }
2823             }
2824         }
2825       else
2826         {
2827           const GFC_REAL_4 *restrict bbase_y;
2828           GFC_REAL_4 s;
2829
2830           for (y = 0; y < ycount; y++)
2831             {
2832               bbase_y = &bbase[y*bystride];
2833               s = (GFC_REAL_4) 0;
2834               for (n = 0; n < count; n++)
2835                 s += abase[n*axstride] * bbase_y[n];
2836               dest[y*rystride] = s;
2837             }
2838         }
2839     }
2840   else if (axstride < aystride)
2841     {
2842       for (y = 0; y < ycount; y++)
2843         for (x = 0; x < xcount; x++)
2844           dest[x*rxstride + y*rystride] = (GFC_REAL_4)0;
2845
2846       for (y = 0; y < ycount; y++)
2847         for (n = 0; n < count; n++)
2848           for (x = 0; x < xcount; x++)
2849             /* dest[x,y] += a[x,n] * b[n,y] */
2850             dest[x*rxstride + y*rystride] +=
2851                                         abase[x*axstride + n*aystride] *
2852                                         bbase[n*bxstride + y*bystride];
2853     }
2854   else if (GFC_DESCRIPTOR_RANK (a) == 1)
2855     {
2856       const GFC_REAL_4 *restrict bbase_y;
2857       GFC_REAL_4 s;
2858
2859       for (y = 0; y < ycount; y++)
2860         {
2861           bbase_y = &bbase[y*bystride];
2862           s = (GFC_REAL_4) 0;
2863           for (n = 0; n < count; n++)
2864             s += abase[n*axstride] * bbase_y[n*bxstride];
2865           dest[y*rxstride] = s;
2866         }
2867     }
2868   else
2869     {
2870       const GFC_REAL_4 *restrict abase_x;
2871       const GFC_REAL_4 *restrict bbase_y;
2872       GFC_REAL_4 *restrict dest_y;
2873       GFC_REAL_4 s;
2874
2875       for (y = 0; y < ycount; y++)
2876         {
2877           bbase_y = &bbase[y*bystride];
2878           dest_y = &dest[y*rystride];
2879           for (x = 0; x < xcount; x++)
2880             {
2881               abase_x = &abase[x*axstride];
2882               s = (GFC_REAL_4) 0;
2883               for (n = 0; n < count; n++)
2884                 s += abase_x[n*aystride] * bbase_y[n*bxstride];
2885               dest_y[x*rxstride] = s;
2886             }
2887         }
2888     }
2889 }
2890 #undef POW3
2891 #undef min
2892 #undef max
2893
2894 #endif
2895 #endif
2896