0996c69ad0479733512ba2b6038507a7abd21790
[gcc.git] / libgfortran / generated / matmul_r4.c
1 /* Implementation of the MATMUL intrinsic
2 Copyright (C) 2002-2017 Free Software Foundation, Inc.
3 Contributed by Paul Brook <paul@nowt.org>
4
5 This file is part of the GNU Fortran runtime library (libgfortran).
6
7 Libgfortran is free software; you can redistribute it and/or
8 modify it under the terms of the GNU General Public
9 License as published by the Free Software Foundation; either
10 version 3 of the License, or (at your option) any later version.
11
12 Libgfortran is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 Under Section 7 of GPL version 3, you are granted additional
18 permissions described in the GCC Runtime Library Exception, version
19 3.1, as published by the Free Software Foundation.
20
21 You should have received a copy of the GNU General Public License and
22 a copy of the GCC Runtime Library Exception along with this program;
23 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
24 <http://www.gnu.org/licenses/>. */
25
26 #include "libgfortran.h"
27 #include <string.h>
28 #include <assert.h>
29
30
31 #if defined (HAVE_GFC_REAL_4)
32
33 /* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
34 passed to us by the front-end, in which case we call it for large
35 matrices. */
36
37 typedef void (*blas_call)(const char *, const char *, const int *, const int *,
38 const int *, const GFC_REAL_4 *, const GFC_REAL_4 *,
39 const int *, const GFC_REAL_4 *, const int *,
40 const GFC_REAL_4 *, GFC_REAL_4 *, const int *,
41 int, int);
42
43 /* The order of loops is different in the case of plain matrix
44 multiplication C=MATMUL(A,B), and in the frequent special case where
45 the argument A is the temporary result of a TRANSPOSE intrinsic:
46 C=MATMUL(TRANSPOSE(A),B). Transposed temporaries are detected by
47 looking at their strides.
48
49 The equivalent Fortran pseudo-code is:
50
51 DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
52 IF (.NOT.IS_TRANSPOSED(A)) THEN
53 C = 0
54 DO J=1,N
55 DO K=1,COUNT
56 DO I=1,M
57 C(I,J) = C(I,J)+A(I,K)*B(K,J)
58 ELSE
59 DO J=1,N
60 DO I=1,M
61 S = 0
62 DO K=1,COUNT
63 S = S+A(I,K)*B(K,J)
64 C(I,J) = S
65 ENDIF
66 */
67
68 /* If try_blas is set to a nonzero value, then the matmul function will
69 see if there is a way to perform the matrix multiplication by a call
70 to the BLAS gemm function. */
71
72 extern void matmul_r4 (gfc_array_r4 * const restrict retarray,
73 gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
74 int blas_limit, blas_call gemm);
75 export_proto(matmul_r4);
76
77 /* Put exhaustive list of possible architectures here here, ORed together. */
78
79 #if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
80
81 #ifdef HAVE_AVX
82 static void
83 matmul_r4_avx (gfc_array_r4 * const restrict retarray,
84 gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
85 int blas_limit, blas_call gemm) __attribute__((__target__("avx")));
86 static void
87 matmul_r4_avx (gfc_array_r4 * const restrict retarray,
88 gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
89 int blas_limit, blas_call gemm)
90 {
91 const GFC_REAL_4 * restrict abase;
92 const GFC_REAL_4 * restrict bbase;
93 GFC_REAL_4 * restrict dest;
94
95 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
96 index_type x, y, n, count, xcount, ycount;
97
98 assert (GFC_DESCRIPTOR_RANK (a) == 2
99 || GFC_DESCRIPTOR_RANK (b) == 2);
100
101 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
102
103 Either A or B (but not both) can be rank 1:
104
105 o One-dimensional argument A is implicitly treated as a row matrix
106 dimensioned [1,count], so xcount=1.
107
108 o One-dimensional argument B is implicitly treated as a column matrix
109 dimensioned [count, 1], so ycount=1.
110 */
111
112 if (retarray->base_addr == NULL)
113 {
114 if (GFC_DESCRIPTOR_RANK (a) == 1)
115 {
116 GFC_DIMENSION_SET(retarray->dim[0], 0,
117 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
118 }
119 else if (GFC_DESCRIPTOR_RANK (b) == 1)
120 {
121 GFC_DIMENSION_SET(retarray->dim[0], 0,
122 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
123 }
124 else
125 {
126 GFC_DIMENSION_SET(retarray->dim[0], 0,
127 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
128
129 GFC_DIMENSION_SET(retarray->dim[1], 0,
130 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
131 GFC_DESCRIPTOR_EXTENT(retarray,0));
132 }
133
134 retarray->base_addr
135 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_4));
136 retarray->offset = 0;
137 }
138 else if (unlikely (compile_options.bounds_check))
139 {
140 index_type ret_extent, arg_extent;
141
142 if (GFC_DESCRIPTOR_RANK (a) == 1)
143 {
144 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
145 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
146 if (arg_extent != ret_extent)
147 runtime_error ("Incorrect extent in return array in"
148 " MATMUL intrinsic: is %ld, should be %ld",
149 (long int) ret_extent, (long int) arg_extent);
150 }
151 else if (GFC_DESCRIPTOR_RANK (b) == 1)
152 {
153 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
154 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
155 if (arg_extent != ret_extent)
156 runtime_error ("Incorrect extent in return array in"
157 " MATMUL intrinsic: is %ld, should be %ld",
158 (long int) ret_extent, (long int) arg_extent);
159 }
160 else
161 {
162 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
163 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
164 if (arg_extent != ret_extent)
165 runtime_error ("Incorrect extent in return array in"
166 " MATMUL intrinsic for dimension 1:"
167 " is %ld, should be %ld",
168 (long int) ret_extent, (long int) arg_extent);
169
170 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
171 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
172 if (arg_extent != ret_extent)
173 runtime_error ("Incorrect extent in return array in"
174 " MATMUL intrinsic for dimension 2:"
175 " is %ld, should be %ld",
176 (long int) ret_extent, (long int) arg_extent);
177 }
178 }
179
180
181 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
182 {
183 /* One-dimensional result may be addressed in the code below
184 either as a row or a column matrix. We want both cases to
185 work. */
186 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
187 }
188 else
189 {
190 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
191 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
192 }
193
194
195 if (GFC_DESCRIPTOR_RANK (a) == 1)
196 {
197 /* Treat it as a a row matrix A[1,count]. */
198 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
199 aystride = 1;
200
201 xcount = 1;
202 count = GFC_DESCRIPTOR_EXTENT(a,0);
203 }
204 else
205 {
206 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
207 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
208
209 count = GFC_DESCRIPTOR_EXTENT(a,1);
210 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
211 }
212
213 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
214 {
215 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
216 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
217 }
218
219 if (GFC_DESCRIPTOR_RANK (b) == 1)
220 {
221 /* Treat it as a column matrix B[count,1] */
222 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
223
224 /* bystride should never be used for 1-dimensional b.
225 in case it is we want it to cause a segfault, rather than
226 an incorrect result. */
227 bystride = 0xDEADBEEF;
228 ycount = 1;
229 }
230 else
231 {
232 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
233 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
234 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
235 }
236
237 abase = a->base_addr;
238 bbase = b->base_addr;
239 dest = retarray->base_addr;
240
241 /* Now that everything is set up, we perform the multiplication
242 itself. */
243
244 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
245 #define min(a,b) ((a) <= (b) ? (a) : (b))
246 #define max(a,b) ((a) >= (b) ? (a) : (b))
247
248 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
249 && (bxstride == 1 || bystride == 1)
250 && (((float) xcount) * ((float) ycount) * ((float) count)
251 > POW3(blas_limit)))
252 {
253 const int m = xcount, n = ycount, k = count, ldc = rystride;
254 const GFC_REAL_4 one = 1, zero = 0;
255 const int lda = (axstride == 1) ? aystride : axstride,
256 ldb = (bxstride == 1) ? bystride : bxstride;
257
258 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
259 {
260 assert (gemm != NULL);
261 gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
262 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
263 &ldc, 1, 1);
264 return;
265 }
266 }
267
268 if (rxstride == 1 && axstride == 1 && bxstride == 1)
269 {
270 /* This block of code implements a tuned matmul, derived from
271 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
272
273 Bo Kagstrom and Per Ling
274 Department of Computing Science
275 Umea University
276 S-901 87 Umea, Sweden
277
278 from netlib.org, translated to C, and modified for matmul.m4. */
279
280 const GFC_REAL_4 *a, *b;
281 GFC_REAL_4 *c;
282 const index_type m = xcount, n = ycount, k = count;
283
284 /* System generated locals */
285 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
286 i1, i2, i3, i4, i5, i6;
287
288 /* Local variables */
289 GFC_REAL_4 f11, f12, f21, f22, f31, f32, f41, f42,
290 f13, f14, f23, f24, f33, f34, f43, f44;
291 index_type i, j, l, ii, jj, ll;
292 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
293
294 a = abase;
295 b = bbase;
296 c = retarray->base_addr;
297
298 /* Parameter adjustments */
299 c_dim1 = rystride;
300 c_offset = 1 + c_dim1;
301 c -= c_offset;
302 a_dim1 = aystride;
303 a_offset = 1 + a_dim1;
304 a -= a_offset;
305 b_dim1 = bystride;
306 b_offset = 1 + b_dim1;
307 b -= b_offset;
308
309 /* Early exit if possible */
310 if (m == 0 || n == 0 || k == 0)
311 return;
312
313 /* Adjust size of t1 to what is needed. */
314 index_type t1_dim;
315 t1_dim = (a_dim1-1) * 256 + b_dim1;
316 if (t1_dim > 65536)
317 t1_dim = 65536;
318
319 #pragma GCC diagnostic push
320 #pragma GCC diagnostic ignored "-Wvla"
321 GFC_REAL_4 t1[t1_dim]; /* was [256][256] */
322 #pragma GCC diagnostic pop
323
324 /* Empty c first. */
325 for (j=1; j<=n; j++)
326 for (i=1; i<=m; i++)
327 c[i + j * c_dim1] = (GFC_REAL_4)0;
328
329 /* Start turning the crank. */
330 i1 = n;
331 for (jj = 1; jj <= i1; jj += 512)
332 {
333 /* Computing MIN */
334 i2 = 512;
335 i3 = n - jj + 1;
336 jsec = min(i2,i3);
337 ujsec = jsec - jsec % 4;
338 i2 = k;
339 for (ll = 1; ll <= i2; ll += 256)
340 {
341 /* Computing MIN */
342 i3 = 256;
343 i4 = k - ll + 1;
344 lsec = min(i3,i4);
345 ulsec = lsec - lsec % 2;
346
347 i3 = m;
348 for (ii = 1; ii <= i3; ii += 256)
349 {
350 /* Computing MIN */
351 i4 = 256;
352 i5 = m - ii + 1;
353 isec = min(i4,i5);
354 uisec = isec - isec % 2;
355 i4 = ll + ulsec - 1;
356 for (l = ll; l <= i4; l += 2)
357 {
358 i5 = ii + uisec - 1;
359 for (i = ii; i <= i5; i += 2)
360 {
361 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
362 a[i + l * a_dim1];
363 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
364 a[i + (l + 1) * a_dim1];
365 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
366 a[i + 1 + l * a_dim1];
367 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
368 a[i + 1 + (l + 1) * a_dim1];
369 }
370 if (uisec < isec)
371 {
372 t1[l - ll + 1 + (isec << 8) - 257] =
373 a[ii + isec - 1 + l * a_dim1];
374 t1[l - ll + 2 + (isec << 8) - 257] =
375 a[ii + isec - 1 + (l + 1) * a_dim1];
376 }
377 }
378 if (ulsec < lsec)
379 {
380 i4 = ii + isec - 1;
381 for (i = ii; i<= i4; ++i)
382 {
383 t1[lsec + ((i - ii + 1) << 8) - 257] =
384 a[i + (ll + lsec - 1) * a_dim1];
385 }
386 }
387
388 uisec = isec - isec % 4;
389 i4 = jj + ujsec - 1;
390 for (j = jj; j <= i4; j += 4)
391 {
392 i5 = ii + uisec - 1;
393 for (i = ii; i <= i5; i += 4)
394 {
395 f11 = c[i + j * c_dim1];
396 f21 = c[i + 1 + j * c_dim1];
397 f12 = c[i + (j + 1) * c_dim1];
398 f22 = c[i + 1 + (j + 1) * c_dim1];
399 f13 = c[i + (j + 2) * c_dim1];
400 f23 = c[i + 1 + (j + 2) * c_dim1];
401 f14 = c[i + (j + 3) * c_dim1];
402 f24 = c[i + 1 + (j + 3) * c_dim1];
403 f31 = c[i + 2 + j * c_dim1];
404 f41 = c[i + 3 + j * c_dim1];
405 f32 = c[i + 2 + (j + 1) * c_dim1];
406 f42 = c[i + 3 + (j + 1) * c_dim1];
407 f33 = c[i + 2 + (j + 2) * c_dim1];
408 f43 = c[i + 3 + (j + 2) * c_dim1];
409 f34 = c[i + 2 + (j + 3) * c_dim1];
410 f44 = c[i + 3 + (j + 3) * c_dim1];
411 i6 = ll + lsec - 1;
412 for (l = ll; l <= i6; ++l)
413 {
414 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
415 * b[l + j * b_dim1];
416 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
417 * b[l + j * b_dim1];
418 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
419 * b[l + (j + 1) * b_dim1];
420 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
421 * b[l + (j + 1) * b_dim1];
422 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
423 * b[l + (j + 2) * b_dim1];
424 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
425 * b[l + (j + 2) * b_dim1];
426 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
427 * b[l + (j + 3) * b_dim1];
428 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
429 * b[l + (j + 3) * b_dim1];
430 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
431 * b[l + j * b_dim1];
432 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
433 * b[l + j * b_dim1];
434 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
435 * b[l + (j + 1) * b_dim1];
436 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
437 * b[l + (j + 1) * b_dim1];
438 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
439 * b[l + (j + 2) * b_dim1];
440 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
441 * b[l + (j + 2) * b_dim1];
442 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
443 * b[l + (j + 3) * b_dim1];
444 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
445 * b[l + (j + 3) * b_dim1];
446 }
447 c[i + j * c_dim1] = f11;
448 c[i + 1 + j * c_dim1] = f21;
449 c[i + (j + 1) * c_dim1] = f12;
450 c[i + 1 + (j + 1) * c_dim1] = f22;
451 c[i + (j + 2) * c_dim1] = f13;
452 c[i + 1 + (j + 2) * c_dim1] = f23;
453 c[i + (j + 3) * c_dim1] = f14;
454 c[i + 1 + (j + 3) * c_dim1] = f24;
455 c[i + 2 + j * c_dim1] = f31;
456 c[i + 3 + j * c_dim1] = f41;
457 c[i + 2 + (j + 1) * c_dim1] = f32;
458 c[i + 3 + (j + 1) * c_dim1] = f42;
459 c[i + 2 + (j + 2) * c_dim1] = f33;
460 c[i + 3 + (j + 2) * c_dim1] = f43;
461 c[i + 2 + (j + 3) * c_dim1] = f34;
462 c[i + 3 + (j + 3) * c_dim1] = f44;
463 }
464 if (uisec < isec)
465 {
466 i5 = ii + isec - 1;
467 for (i = ii + uisec; i <= i5; ++i)
468 {
469 f11 = c[i + j * c_dim1];
470 f12 = c[i + (j + 1) * c_dim1];
471 f13 = c[i + (j + 2) * c_dim1];
472 f14 = c[i + (j + 3) * c_dim1];
473 i6 = ll + lsec - 1;
474 for (l = ll; l <= i6; ++l)
475 {
476 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
477 257] * b[l + j * b_dim1];
478 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
479 257] * b[l + (j + 1) * b_dim1];
480 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
481 257] * b[l + (j + 2) * b_dim1];
482 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
483 257] * b[l + (j + 3) * b_dim1];
484 }
485 c[i + j * c_dim1] = f11;
486 c[i + (j + 1) * c_dim1] = f12;
487 c[i + (j + 2) * c_dim1] = f13;
488 c[i + (j + 3) * c_dim1] = f14;
489 }
490 }
491 }
492 if (ujsec < jsec)
493 {
494 i4 = jj + jsec - 1;
495 for (j = jj + ujsec; j <= i4; ++j)
496 {
497 i5 = ii + uisec - 1;
498 for (i = ii; i <= i5; i += 4)
499 {
500 f11 = c[i + j * c_dim1];
501 f21 = c[i + 1 + j * c_dim1];
502 f31 = c[i + 2 + j * c_dim1];
503 f41 = c[i + 3 + j * c_dim1];
504 i6 = ll + lsec - 1;
505 for (l = ll; l <= i6; ++l)
506 {
507 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
508 257] * b[l + j * b_dim1];
509 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
510 257] * b[l + j * b_dim1];
511 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
512 257] * b[l + j * b_dim1];
513 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
514 257] * b[l + j * b_dim1];
515 }
516 c[i + j * c_dim1] = f11;
517 c[i + 1 + j * c_dim1] = f21;
518 c[i + 2 + j * c_dim1] = f31;
519 c[i + 3 + j * c_dim1] = f41;
520 }
521 i5 = ii + isec - 1;
522 for (i = ii + uisec; i <= i5; ++i)
523 {
524 f11 = c[i + j * c_dim1];
525 i6 = ll + lsec - 1;
526 for (l = ll; l <= i6; ++l)
527 {
528 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
529 257] * b[l + j * b_dim1];
530 }
531 c[i + j * c_dim1] = f11;
532 }
533 }
534 }
535 }
536 }
537 }
538 return;
539 }
540 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
541 {
542 if (GFC_DESCRIPTOR_RANK (a) != 1)
543 {
544 const GFC_REAL_4 *restrict abase_x;
545 const GFC_REAL_4 *restrict bbase_y;
546 GFC_REAL_4 *restrict dest_y;
547 GFC_REAL_4 s;
548
549 for (y = 0; y < ycount; y++)
550 {
551 bbase_y = &bbase[y*bystride];
552 dest_y = &dest[y*rystride];
553 for (x = 0; x < xcount; x++)
554 {
555 abase_x = &abase[x*axstride];
556 s = (GFC_REAL_4) 0;
557 for (n = 0; n < count; n++)
558 s += abase_x[n] * bbase_y[n];
559 dest_y[x] = s;
560 }
561 }
562 }
563 else
564 {
565 const GFC_REAL_4 *restrict bbase_y;
566 GFC_REAL_4 s;
567
568 for (y = 0; y < ycount; y++)
569 {
570 bbase_y = &bbase[y*bystride];
571 s = (GFC_REAL_4) 0;
572 for (n = 0; n < count; n++)
573 s += abase[n*axstride] * bbase_y[n];
574 dest[y*rystride] = s;
575 }
576 }
577 }
578 else if (axstride < aystride)
579 {
580 for (y = 0; y < ycount; y++)
581 for (x = 0; x < xcount; x++)
582 dest[x*rxstride + y*rystride] = (GFC_REAL_4)0;
583
584 for (y = 0; y < ycount; y++)
585 for (n = 0; n < count; n++)
586 for (x = 0; x < xcount; x++)
587 /* dest[x,y] += a[x,n] * b[n,y] */
588 dest[x*rxstride + y*rystride] +=
589 abase[x*axstride + n*aystride] *
590 bbase[n*bxstride + y*bystride];
591 }
592 else if (GFC_DESCRIPTOR_RANK (a) == 1)
593 {
594 const GFC_REAL_4 *restrict bbase_y;
595 GFC_REAL_4 s;
596
597 for (y = 0; y < ycount; y++)
598 {
599 bbase_y = &bbase[y*bystride];
600 s = (GFC_REAL_4) 0;
601 for (n = 0; n < count; n++)
602 s += abase[n*axstride] * bbase_y[n*bxstride];
603 dest[y*rxstride] = s;
604 }
605 }
606 else
607 {
608 const GFC_REAL_4 *restrict abase_x;
609 const GFC_REAL_4 *restrict bbase_y;
610 GFC_REAL_4 *restrict dest_y;
611 GFC_REAL_4 s;
612
613 for (y = 0; y < ycount; y++)
614 {
615 bbase_y = &bbase[y*bystride];
616 dest_y = &dest[y*rystride];
617 for (x = 0; x < xcount; x++)
618 {
619 abase_x = &abase[x*axstride];
620 s = (GFC_REAL_4) 0;
621 for (n = 0; n < count; n++)
622 s += abase_x[n*aystride] * bbase_y[n*bxstride];
623 dest_y[x*rxstride] = s;
624 }
625 }
626 }
627 }
628 #undef POW3
629 #undef min
630 #undef max
631
632 #endif /* HAVE_AVX */
633
634 #ifdef HAVE_AVX2
635 static void
636 matmul_r4_avx2 (gfc_array_r4 * const restrict retarray,
637 gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
638 int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
639 static void
640 matmul_r4_avx2 (gfc_array_r4 * const restrict retarray,
641 gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
642 int blas_limit, blas_call gemm)
643 {
644 const GFC_REAL_4 * restrict abase;
645 const GFC_REAL_4 * restrict bbase;
646 GFC_REAL_4 * restrict dest;
647
648 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
649 index_type x, y, n, count, xcount, ycount;
650
651 assert (GFC_DESCRIPTOR_RANK (a) == 2
652 || GFC_DESCRIPTOR_RANK (b) == 2);
653
654 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
655
656 Either A or B (but not both) can be rank 1:
657
658 o One-dimensional argument A is implicitly treated as a row matrix
659 dimensioned [1,count], so xcount=1.
660
661 o One-dimensional argument B is implicitly treated as a column matrix
662 dimensioned [count, 1], so ycount=1.
663 */
664
665 if (retarray->base_addr == NULL)
666 {
667 if (GFC_DESCRIPTOR_RANK (a) == 1)
668 {
669 GFC_DIMENSION_SET(retarray->dim[0], 0,
670 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
671 }
672 else if (GFC_DESCRIPTOR_RANK (b) == 1)
673 {
674 GFC_DIMENSION_SET(retarray->dim[0], 0,
675 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
676 }
677 else
678 {
679 GFC_DIMENSION_SET(retarray->dim[0], 0,
680 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
681
682 GFC_DIMENSION_SET(retarray->dim[1], 0,
683 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
684 GFC_DESCRIPTOR_EXTENT(retarray,0));
685 }
686
687 retarray->base_addr
688 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_4));
689 retarray->offset = 0;
690 }
691 else if (unlikely (compile_options.bounds_check))
692 {
693 index_type ret_extent, arg_extent;
694
695 if (GFC_DESCRIPTOR_RANK (a) == 1)
696 {
697 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
698 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
699 if (arg_extent != ret_extent)
700 runtime_error ("Incorrect extent in return array in"
701 " MATMUL intrinsic: is %ld, should be %ld",
702 (long int) ret_extent, (long int) arg_extent);
703 }
704 else if (GFC_DESCRIPTOR_RANK (b) == 1)
705 {
706 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
707 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
708 if (arg_extent != ret_extent)
709 runtime_error ("Incorrect extent in return array in"
710 " MATMUL intrinsic: is %ld, should be %ld",
711 (long int) ret_extent, (long int) arg_extent);
712 }
713 else
714 {
715 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
716 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
717 if (arg_extent != ret_extent)
718 runtime_error ("Incorrect extent in return array in"
719 " MATMUL intrinsic for dimension 1:"
720 " is %ld, should be %ld",
721 (long int) ret_extent, (long int) arg_extent);
722
723 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
724 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
725 if (arg_extent != ret_extent)
726 runtime_error ("Incorrect extent in return array in"
727 " MATMUL intrinsic for dimension 2:"
728 " is %ld, should be %ld",
729 (long int) ret_extent, (long int) arg_extent);
730 }
731 }
732
733
734 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
735 {
736 /* One-dimensional result may be addressed in the code below
737 either as a row or a column matrix. We want both cases to
738 work. */
739 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
740 }
741 else
742 {
743 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
744 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
745 }
746
747
748 if (GFC_DESCRIPTOR_RANK (a) == 1)
749 {
750 /* Treat it as a a row matrix A[1,count]. */
751 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
752 aystride = 1;
753
754 xcount = 1;
755 count = GFC_DESCRIPTOR_EXTENT(a,0);
756 }
757 else
758 {
759 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
760 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
761
762 count = GFC_DESCRIPTOR_EXTENT(a,1);
763 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
764 }
765
766 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
767 {
768 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
769 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
770 }
771
772 if (GFC_DESCRIPTOR_RANK (b) == 1)
773 {
774 /* Treat it as a column matrix B[count,1] */
775 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
776
777 /* bystride should never be used for 1-dimensional b.
778 in case it is we want it to cause a segfault, rather than
779 an incorrect result. */
780 bystride = 0xDEADBEEF;
781 ycount = 1;
782 }
783 else
784 {
785 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
786 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
787 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
788 }
789
790 abase = a->base_addr;
791 bbase = b->base_addr;
792 dest = retarray->base_addr;
793
794 /* Now that everything is set up, we perform the multiplication
795 itself. */
796
797 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
798 #define min(a,b) ((a) <= (b) ? (a) : (b))
799 #define max(a,b) ((a) >= (b) ? (a) : (b))
800
801 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
802 && (bxstride == 1 || bystride == 1)
803 && (((float) xcount) * ((float) ycount) * ((float) count)
804 > POW3(blas_limit)))
805 {
806 const int m = xcount, n = ycount, k = count, ldc = rystride;
807 const GFC_REAL_4 one = 1, zero = 0;
808 const int lda = (axstride == 1) ? aystride : axstride,
809 ldb = (bxstride == 1) ? bystride : bxstride;
810
811 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
812 {
813 assert (gemm != NULL);
814 gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
815 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
816 &ldc, 1, 1);
817 return;
818 }
819 }
820
821 if (rxstride == 1 && axstride == 1 && bxstride == 1)
822 {
823 /* This block of code implements a tuned matmul, derived from
824 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
825
826 Bo Kagstrom and Per Ling
827 Department of Computing Science
828 Umea University
829 S-901 87 Umea, Sweden
830
831 from netlib.org, translated to C, and modified for matmul.m4. */
832
833 const GFC_REAL_4 *a, *b;
834 GFC_REAL_4 *c;
835 const index_type m = xcount, n = ycount, k = count;
836
837 /* System generated locals */
838 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
839 i1, i2, i3, i4, i5, i6;
840
841 /* Local variables */
842 GFC_REAL_4 f11, f12, f21, f22, f31, f32, f41, f42,
843 f13, f14, f23, f24, f33, f34, f43, f44;
844 index_type i, j, l, ii, jj, ll;
845 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
846
847 a = abase;
848 b = bbase;
849 c = retarray->base_addr;
850
851 /* Parameter adjustments */
852 c_dim1 = rystride;
853 c_offset = 1 + c_dim1;
854 c -= c_offset;
855 a_dim1 = aystride;
856 a_offset = 1 + a_dim1;
857 a -= a_offset;
858 b_dim1 = bystride;
859 b_offset = 1 + b_dim1;
860 b -= b_offset;
861
862 /* Early exit if possible */
863 if (m == 0 || n == 0 || k == 0)
864 return;
865
866 /* Adjust size of t1 to what is needed. */
867 index_type t1_dim;
868 t1_dim = (a_dim1-1) * 256 + b_dim1;
869 if (t1_dim > 65536)
870 t1_dim = 65536;
871
872 #pragma GCC diagnostic push
873 #pragma GCC diagnostic ignored "-Wvla"
874 GFC_REAL_4 t1[t1_dim]; /* was [256][256] */
875 #pragma GCC diagnostic pop
876
877 /* Empty c first. */
878 for (j=1; j<=n; j++)
879 for (i=1; i<=m; i++)
880 c[i + j * c_dim1] = (GFC_REAL_4)0;
881
882 /* Start turning the crank. */
883 i1 = n;
884 for (jj = 1; jj <= i1; jj += 512)
885 {
886 /* Computing MIN */
887 i2 = 512;
888 i3 = n - jj + 1;
889 jsec = min(i2,i3);
890 ujsec = jsec - jsec % 4;
891 i2 = k;
892 for (ll = 1; ll <= i2; ll += 256)
893 {
894 /* Computing MIN */
895 i3 = 256;
896 i4 = k - ll + 1;
897 lsec = min(i3,i4);
898 ulsec = lsec - lsec % 2;
899
900 i3 = m;
901 for (ii = 1; ii <= i3; ii += 256)
902 {
903 /* Computing MIN */
904 i4 = 256;
905 i5 = m - ii + 1;
906 isec = min(i4,i5);
907 uisec = isec - isec % 2;
908 i4 = ll + ulsec - 1;
909 for (l = ll; l <= i4; l += 2)
910 {
911 i5 = ii + uisec - 1;
912 for (i = ii; i <= i5; i += 2)
913 {
914 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
915 a[i + l * a_dim1];
916 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
917 a[i + (l + 1) * a_dim1];
918 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
919 a[i + 1 + l * a_dim1];
920 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
921 a[i + 1 + (l + 1) * a_dim1];
922 }
923 if (uisec < isec)
924 {
925 t1[l - ll + 1 + (isec << 8) - 257] =
926 a[ii + isec - 1 + l * a_dim1];
927 t1[l - ll + 2 + (isec << 8) - 257] =
928 a[ii + isec - 1 + (l + 1) * a_dim1];
929 }
930 }
931 if (ulsec < lsec)
932 {
933 i4 = ii + isec - 1;
934 for (i = ii; i<= i4; ++i)
935 {
936 t1[lsec + ((i - ii + 1) << 8) - 257] =
937 a[i + (ll + lsec - 1) * a_dim1];
938 }
939 }
940
941 uisec = isec - isec % 4;
942 i4 = jj + ujsec - 1;
943 for (j = jj; j <= i4; j += 4)
944 {
945 i5 = ii + uisec - 1;
946 for (i = ii; i <= i5; i += 4)
947 {
948 f11 = c[i + j * c_dim1];
949 f21 = c[i + 1 + j * c_dim1];
950 f12 = c[i + (j + 1) * c_dim1];
951 f22 = c[i + 1 + (j + 1) * c_dim1];
952 f13 = c[i + (j + 2) * c_dim1];
953 f23 = c[i + 1 + (j + 2) * c_dim1];
954 f14 = c[i + (j + 3) * c_dim1];
955 f24 = c[i + 1 + (j + 3) * c_dim1];
956 f31 = c[i + 2 + j * c_dim1];
957 f41 = c[i + 3 + j * c_dim1];
958 f32 = c[i + 2 + (j + 1) * c_dim1];
959 f42 = c[i + 3 + (j + 1) * c_dim1];
960 f33 = c[i + 2 + (j + 2) * c_dim1];
961 f43 = c[i + 3 + (j + 2) * c_dim1];
962 f34 = c[i + 2 + (j + 3) * c_dim1];
963 f44 = c[i + 3 + (j + 3) * c_dim1];
964 i6 = ll + lsec - 1;
965 for (l = ll; l <= i6; ++l)
966 {
967 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
968 * b[l + j * b_dim1];
969 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
970 * b[l + j * b_dim1];
971 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
972 * b[l + (j + 1) * b_dim1];
973 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
974 * b[l + (j + 1) * b_dim1];
975 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
976 * b[l + (j + 2) * b_dim1];
977 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
978 * b[l + (j + 2) * b_dim1];
979 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
980 * b[l + (j + 3) * b_dim1];
981 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
982 * b[l + (j + 3) * b_dim1];
983 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
984 * b[l + j * b_dim1];
985 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
986 * b[l + j * b_dim1];
987 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
988 * b[l + (j + 1) * b_dim1];
989 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
990 * b[l + (j + 1) * b_dim1];
991 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
992 * b[l + (j + 2) * b_dim1];
993 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
994 * b[l + (j + 2) * b_dim1];
995 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
996 * b[l + (j + 3) * b_dim1];
997 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
998 * b[l + (j + 3) * b_dim1];
999 }
1000 c[i + j * c_dim1] = f11;
1001 c[i + 1 + j * c_dim1] = f21;
1002 c[i + (j + 1) * c_dim1] = f12;
1003 c[i + 1 + (j + 1) * c_dim1] = f22;
1004 c[i + (j + 2) * c_dim1] = f13;
1005 c[i + 1 + (j + 2) * c_dim1] = f23;
1006 c[i + (j + 3) * c_dim1] = f14;
1007 c[i + 1 + (j + 3) * c_dim1] = f24;
1008 c[i + 2 + j * c_dim1] = f31;
1009 c[i + 3 + j * c_dim1] = f41;
1010 c[i + 2 + (j + 1) * c_dim1] = f32;
1011 c[i + 3 + (j + 1) * c_dim1] = f42;
1012 c[i + 2 + (j + 2) * c_dim1] = f33;
1013 c[i + 3 + (j + 2) * c_dim1] = f43;
1014 c[i + 2 + (j + 3) * c_dim1] = f34;
1015 c[i + 3 + (j + 3) * c_dim1] = f44;
1016 }
1017 if (uisec < isec)
1018 {
1019 i5 = ii + isec - 1;
1020 for (i = ii + uisec; i <= i5; ++i)
1021 {
1022 f11 = c[i + j * c_dim1];
1023 f12 = c[i + (j + 1) * c_dim1];
1024 f13 = c[i + (j + 2) * c_dim1];
1025 f14 = c[i + (j + 3) * c_dim1];
1026 i6 = ll + lsec - 1;
1027 for (l = ll; l <= i6; ++l)
1028 {
1029 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1030 257] * b[l + j * b_dim1];
1031 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1032 257] * b[l + (j + 1) * b_dim1];
1033 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1034 257] * b[l + (j + 2) * b_dim1];
1035 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1036 257] * b[l + (j + 3) * b_dim1];
1037 }
1038 c[i + j * c_dim1] = f11;
1039 c[i + (j + 1) * c_dim1] = f12;
1040 c[i + (j + 2) * c_dim1] = f13;
1041 c[i + (j + 3) * c_dim1] = f14;
1042 }
1043 }
1044 }
1045 if (ujsec < jsec)
1046 {
1047 i4 = jj + jsec - 1;
1048 for (j = jj + ujsec; j <= i4; ++j)
1049 {
1050 i5 = ii + uisec - 1;
1051 for (i = ii; i <= i5; i += 4)
1052 {
1053 f11 = c[i + j * c_dim1];
1054 f21 = c[i + 1 + j * c_dim1];
1055 f31 = c[i + 2 + j * c_dim1];
1056 f41 = c[i + 3 + j * c_dim1];
1057 i6 = ll + lsec - 1;
1058 for (l = ll; l <= i6; ++l)
1059 {
1060 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1061 257] * b[l + j * b_dim1];
1062 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
1063 257] * b[l + j * b_dim1];
1064 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
1065 257] * b[l + j * b_dim1];
1066 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
1067 257] * b[l + j * b_dim1];
1068 }
1069 c[i + j * c_dim1] = f11;
1070 c[i + 1 + j * c_dim1] = f21;
1071 c[i + 2 + j * c_dim1] = f31;
1072 c[i + 3 + j * c_dim1] = f41;
1073 }
1074 i5 = ii + isec - 1;
1075 for (i = ii + uisec; i <= i5; ++i)
1076 {
1077 f11 = c[i + j * c_dim1];
1078 i6 = ll + lsec - 1;
1079 for (l = ll; l <= i6; ++l)
1080 {
1081 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1082 257] * b[l + j * b_dim1];
1083 }
1084 c[i + j * c_dim1] = f11;
1085 }
1086 }
1087 }
1088 }
1089 }
1090 }
1091 return;
1092 }
1093 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
1094 {
1095 if (GFC_DESCRIPTOR_RANK (a) != 1)
1096 {
1097 const GFC_REAL_4 *restrict abase_x;
1098 const GFC_REAL_4 *restrict bbase_y;
1099 GFC_REAL_4 *restrict dest_y;
1100 GFC_REAL_4 s;
1101
1102 for (y = 0; y < ycount; y++)
1103 {
1104 bbase_y = &bbase[y*bystride];
1105 dest_y = &dest[y*rystride];
1106 for (x = 0; x < xcount; x++)
1107 {
1108 abase_x = &abase[x*axstride];
1109 s = (GFC_REAL_4) 0;
1110 for (n = 0; n < count; n++)
1111 s += abase_x[n] * bbase_y[n];
1112 dest_y[x] = s;
1113 }
1114 }
1115 }
1116 else
1117 {
1118 const GFC_REAL_4 *restrict bbase_y;
1119 GFC_REAL_4 s;
1120
1121 for (y = 0; y < ycount; y++)
1122 {
1123 bbase_y = &bbase[y*bystride];
1124 s = (GFC_REAL_4) 0;
1125 for (n = 0; n < count; n++)
1126 s += abase[n*axstride] * bbase_y[n];
1127 dest[y*rystride] = s;
1128 }
1129 }
1130 }
1131 else if (axstride < aystride)
1132 {
1133 for (y = 0; y < ycount; y++)
1134 for (x = 0; x < xcount; x++)
1135 dest[x*rxstride + y*rystride] = (GFC_REAL_4)0;
1136
1137 for (y = 0; y < ycount; y++)
1138 for (n = 0; n < count; n++)
1139 for (x = 0; x < xcount; x++)
1140 /* dest[x,y] += a[x,n] * b[n,y] */
1141 dest[x*rxstride + y*rystride] +=
1142 abase[x*axstride + n*aystride] *
1143 bbase[n*bxstride + y*bystride];
1144 }
1145 else if (GFC_DESCRIPTOR_RANK (a) == 1)
1146 {
1147 const GFC_REAL_4 *restrict bbase_y;
1148 GFC_REAL_4 s;
1149
1150 for (y = 0; y < ycount; y++)
1151 {
1152 bbase_y = &bbase[y*bystride];
1153 s = (GFC_REAL_4) 0;
1154 for (n = 0; n < count; n++)
1155 s += abase[n*axstride] * bbase_y[n*bxstride];
1156 dest[y*rxstride] = s;
1157 }
1158 }
1159 else
1160 {
1161 const GFC_REAL_4 *restrict abase_x;
1162 const GFC_REAL_4 *restrict bbase_y;
1163 GFC_REAL_4 *restrict dest_y;
1164 GFC_REAL_4 s;
1165
1166 for (y = 0; y < ycount; y++)
1167 {
1168 bbase_y = &bbase[y*bystride];
1169 dest_y = &dest[y*rystride];
1170 for (x = 0; x < xcount; x++)
1171 {
1172 abase_x = &abase[x*axstride];
1173 s = (GFC_REAL_4) 0;
1174 for (n = 0; n < count; n++)
1175 s += abase_x[n*aystride] * bbase_y[n*bxstride];
1176 dest_y[x*rxstride] = s;
1177 }
1178 }
1179 }
1180 }
1181 #undef POW3
1182 #undef min
1183 #undef max
1184
1185 #endif /* HAVE_AVX2 */
1186
1187 #ifdef HAVE_AVX512F
1188 static void
1189 matmul_r4_avx512f (gfc_array_r4 * const restrict retarray,
1190 gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
1191 int blas_limit, blas_call gemm) __attribute__((__target__("avx512f")));
1192 static void
1193 matmul_r4_avx512f (gfc_array_r4 * const restrict retarray,
1194 gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
1195 int blas_limit, blas_call gemm)
1196 {
1197 const GFC_REAL_4 * restrict abase;
1198 const GFC_REAL_4 * restrict bbase;
1199 GFC_REAL_4 * restrict dest;
1200
1201 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
1202 index_type x, y, n, count, xcount, ycount;
1203
1204 assert (GFC_DESCRIPTOR_RANK (a) == 2
1205 || GFC_DESCRIPTOR_RANK (b) == 2);
1206
1207 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
1208
1209 Either A or B (but not both) can be rank 1:
1210
1211 o One-dimensional argument A is implicitly treated as a row matrix
1212 dimensioned [1,count], so xcount=1.
1213
1214 o One-dimensional argument B is implicitly treated as a column matrix
1215 dimensioned [count, 1], so ycount=1.
1216 */
1217
1218 if (retarray->base_addr == NULL)
1219 {
1220 if (GFC_DESCRIPTOR_RANK (a) == 1)
1221 {
1222 GFC_DIMENSION_SET(retarray->dim[0], 0,
1223 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
1224 }
1225 else if (GFC_DESCRIPTOR_RANK (b) == 1)
1226 {
1227 GFC_DIMENSION_SET(retarray->dim[0], 0,
1228 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1229 }
1230 else
1231 {
1232 GFC_DIMENSION_SET(retarray->dim[0], 0,
1233 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1234
1235 GFC_DIMENSION_SET(retarray->dim[1], 0,
1236 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
1237 GFC_DESCRIPTOR_EXTENT(retarray,0));
1238 }
1239
1240 retarray->base_addr
1241 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_4));
1242 retarray->offset = 0;
1243 }
1244 else if (unlikely (compile_options.bounds_check))
1245 {
1246 index_type ret_extent, arg_extent;
1247
1248 if (GFC_DESCRIPTOR_RANK (a) == 1)
1249 {
1250 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1251 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1252 if (arg_extent != ret_extent)
1253 runtime_error ("Incorrect extent in return array in"
1254 " MATMUL intrinsic: is %ld, should be %ld",
1255 (long int) ret_extent, (long int) arg_extent);
1256 }
1257 else if (GFC_DESCRIPTOR_RANK (b) == 1)
1258 {
1259 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1260 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1261 if (arg_extent != ret_extent)
1262 runtime_error ("Incorrect extent in return array in"
1263 " MATMUL intrinsic: is %ld, should be %ld",
1264 (long int) ret_extent, (long int) arg_extent);
1265 }
1266 else
1267 {
1268 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1269 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1270 if (arg_extent != ret_extent)
1271 runtime_error ("Incorrect extent in return array in"
1272 " MATMUL intrinsic for dimension 1:"
1273 " is %ld, should be %ld",
1274 (long int) ret_extent, (long int) arg_extent);
1275
1276 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1277 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
1278 if (arg_extent != ret_extent)
1279 runtime_error ("Incorrect extent in return array in"
1280 " MATMUL intrinsic for dimension 2:"
1281 " is %ld, should be %ld",
1282 (long int) ret_extent, (long int) arg_extent);
1283 }
1284 }
1285
1286
1287 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
1288 {
1289 /* One-dimensional result may be addressed in the code below
1290 either as a row or a column matrix. We want both cases to
1291 work. */
1292 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1293 }
1294 else
1295 {
1296 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1297 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
1298 }
1299
1300
1301 if (GFC_DESCRIPTOR_RANK (a) == 1)
1302 {
1303 /* Treat it as a a row matrix A[1,count]. */
1304 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1305 aystride = 1;
1306
1307 xcount = 1;
1308 count = GFC_DESCRIPTOR_EXTENT(a,0);
1309 }
1310 else
1311 {
1312 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1313 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
1314
1315 count = GFC_DESCRIPTOR_EXTENT(a,1);
1316 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
1317 }
1318
1319 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
1320 {
1321 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
1322 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
1323 }
1324
1325 if (GFC_DESCRIPTOR_RANK (b) == 1)
1326 {
1327 /* Treat it as a column matrix B[count,1] */
1328 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1329
1330 /* bystride should never be used for 1-dimensional b.
1331 in case it is we want it to cause a segfault, rather than
1332 an incorrect result. */
1333 bystride = 0xDEADBEEF;
1334 ycount = 1;
1335 }
1336 else
1337 {
1338 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1339 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
1340 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
1341 }
1342
1343 abase = a->base_addr;
1344 bbase = b->base_addr;
1345 dest = retarray->base_addr;
1346
1347 /* Now that everything is set up, we perform the multiplication
1348 itself. */
1349
1350 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
1351 #define min(a,b) ((a) <= (b) ? (a) : (b))
1352 #define max(a,b) ((a) >= (b) ? (a) : (b))
1353
1354 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
1355 && (bxstride == 1 || bystride == 1)
1356 && (((float) xcount) * ((float) ycount) * ((float) count)
1357 > POW3(blas_limit)))
1358 {
1359 const int m = xcount, n = ycount, k = count, ldc = rystride;
1360 const GFC_REAL_4 one = 1, zero = 0;
1361 const int lda = (axstride == 1) ? aystride : axstride,
1362 ldb = (bxstride == 1) ? bystride : bxstride;
1363
1364 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
1365 {
1366 assert (gemm != NULL);
1367 gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
1368 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
1369 &ldc, 1, 1);
1370 return;
1371 }
1372 }
1373
1374 if (rxstride == 1 && axstride == 1 && bxstride == 1)
1375 {
1376 /* This block of code implements a tuned matmul, derived from
1377 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
1378
1379 Bo Kagstrom and Per Ling
1380 Department of Computing Science
1381 Umea University
1382 S-901 87 Umea, Sweden
1383
1384 from netlib.org, translated to C, and modified for matmul.m4. */
1385
1386 const GFC_REAL_4 *a, *b;
1387 GFC_REAL_4 *c;
1388 const index_type m = xcount, n = ycount, k = count;
1389
1390 /* System generated locals */
1391 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
1392 i1, i2, i3, i4, i5, i6;
1393
1394 /* Local variables */
1395 GFC_REAL_4 f11, f12, f21, f22, f31, f32, f41, f42,
1396 f13, f14, f23, f24, f33, f34, f43, f44;
1397 index_type i, j, l, ii, jj, ll;
1398 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
1399
1400 a = abase;
1401 b = bbase;
1402 c = retarray->base_addr;
1403
1404 /* Parameter adjustments */
1405 c_dim1 = rystride;
1406 c_offset = 1 + c_dim1;
1407 c -= c_offset;
1408 a_dim1 = aystride;
1409 a_offset = 1 + a_dim1;
1410 a -= a_offset;
1411 b_dim1 = bystride;
1412 b_offset = 1 + b_dim1;
1413 b -= b_offset;
1414
1415 /* Early exit if possible */
1416 if (m == 0 || n == 0 || k == 0)
1417 return;
1418
1419 /* Adjust size of t1 to what is needed. */
1420 index_type t1_dim;
1421 t1_dim = (a_dim1-1) * 256 + b_dim1;
1422 if (t1_dim > 65536)
1423 t1_dim = 65536;
1424
1425 #pragma GCC diagnostic push
1426 #pragma GCC diagnostic ignored "-Wvla"
1427 GFC_REAL_4 t1[t1_dim]; /* was [256][256] */
1428 #pragma GCC diagnostic pop
1429
1430 /* Empty c first. */
1431 for (j=1; j<=n; j++)
1432 for (i=1; i<=m; i++)
1433 c[i + j * c_dim1] = (GFC_REAL_4)0;
1434
1435 /* Start turning the crank. */
1436 i1 = n;
1437 for (jj = 1; jj <= i1; jj += 512)
1438 {
1439 /* Computing MIN */
1440 i2 = 512;
1441 i3 = n - jj + 1;
1442 jsec = min(i2,i3);
1443 ujsec = jsec - jsec % 4;
1444 i2 = k;
1445 for (ll = 1; ll <= i2; ll += 256)
1446 {
1447 /* Computing MIN */
1448 i3 = 256;
1449 i4 = k - ll + 1;
1450 lsec = min(i3,i4);
1451 ulsec = lsec - lsec % 2;
1452
1453 i3 = m;
1454 for (ii = 1; ii <= i3; ii += 256)
1455 {
1456 /* Computing MIN */
1457 i4 = 256;
1458 i5 = m - ii + 1;
1459 isec = min(i4,i5);
1460 uisec = isec - isec % 2;
1461 i4 = ll + ulsec - 1;
1462 for (l = ll; l <= i4; l += 2)
1463 {
1464 i5 = ii + uisec - 1;
1465 for (i = ii; i <= i5; i += 2)
1466 {
1467 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
1468 a[i + l * a_dim1];
1469 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
1470 a[i + (l + 1) * a_dim1];
1471 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
1472 a[i + 1 + l * a_dim1];
1473 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
1474 a[i + 1 + (l + 1) * a_dim1];
1475 }
1476 if (uisec < isec)
1477 {
1478 t1[l - ll + 1 + (isec << 8) - 257] =
1479 a[ii + isec - 1 + l * a_dim1];
1480 t1[l - ll + 2 + (isec << 8) - 257] =
1481 a[ii + isec - 1 + (l + 1) * a_dim1];
1482 }
1483 }
1484 if (ulsec < lsec)
1485 {
1486 i4 = ii + isec - 1;
1487 for (i = ii; i<= i4; ++i)
1488 {
1489 t1[lsec + ((i - ii + 1) << 8) - 257] =
1490 a[i + (ll + lsec - 1) * a_dim1];
1491 }
1492 }
1493
1494 uisec = isec - isec % 4;
1495 i4 = jj + ujsec - 1;
1496 for (j = jj; j <= i4; j += 4)
1497 {
1498 i5 = ii + uisec - 1;
1499 for (i = ii; i <= i5; i += 4)
1500 {
1501 f11 = c[i + j * c_dim1];
1502 f21 = c[i + 1 + j * c_dim1];
1503 f12 = c[i + (j + 1) * c_dim1];
1504 f22 = c[i + 1 + (j + 1) * c_dim1];
1505 f13 = c[i + (j + 2) * c_dim1];
1506 f23 = c[i + 1 + (j + 2) * c_dim1];
1507 f14 = c[i + (j + 3) * c_dim1];
1508 f24 = c[i + 1 + (j + 3) * c_dim1];
1509 f31 = c[i + 2 + j * c_dim1];
1510 f41 = c[i + 3 + j * c_dim1];
1511 f32 = c[i + 2 + (j + 1) * c_dim1];
1512 f42 = c[i + 3 + (j + 1) * c_dim1];
1513 f33 = c[i + 2 + (j + 2) * c_dim1];
1514 f43 = c[i + 3 + (j + 2) * c_dim1];
1515 f34 = c[i + 2 + (j + 3) * c_dim1];
1516 f44 = c[i + 3 + (j + 3) * c_dim1];
1517 i6 = ll + lsec - 1;
1518 for (l = ll; l <= i6; ++l)
1519 {
1520 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1521 * b[l + j * b_dim1];
1522 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1523 * b[l + j * b_dim1];
1524 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1525 * b[l + (j + 1) * b_dim1];
1526 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1527 * b[l + (j + 1) * b_dim1];
1528 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1529 * b[l + (j + 2) * b_dim1];
1530 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1531 * b[l + (j + 2) * b_dim1];
1532 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
1533 * b[l + (j + 3) * b_dim1];
1534 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
1535 * b[l + (j + 3) * b_dim1];
1536 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1537 * b[l + j * b_dim1];
1538 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1539 * b[l + j * b_dim1];
1540 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1541 * b[l + (j + 1) * b_dim1];
1542 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1543 * b[l + (j + 1) * b_dim1];
1544 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1545 * b[l + (j + 2) * b_dim1];
1546 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1547 * b[l + (j + 2) * b_dim1];
1548 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
1549 * b[l + (j + 3) * b_dim1];
1550 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
1551 * b[l + (j + 3) * b_dim1];
1552 }
1553 c[i + j * c_dim1] = f11;
1554 c[i + 1 + j * c_dim1] = f21;
1555 c[i + (j + 1) * c_dim1] = f12;
1556 c[i + 1 + (j + 1) * c_dim1] = f22;
1557 c[i + (j + 2) * c_dim1] = f13;
1558 c[i + 1 + (j + 2) * c_dim1] = f23;
1559 c[i + (j + 3) * c_dim1] = f14;
1560 c[i + 1 + (j + 3) * c_dim1] = f24;
1561 c[i + 2 + j * c_dim1] = f31;
1562 c[i + 3 + j * c_dim1] = f41;
1563 c[i + 2 + (j + 1) * c_dim1] = f32;
1564 c[i + 3 + (j + 1) * c_dim1] = f42;
1565 c[i + 2 + (j + 2) * c_dim1] = f33;
1566 c[i + 3 + (j + 2) * c_dim1] = f43;
1567 c[i + 2 + (j + 3) * c_dim1] = f34;
1568 c[i + 3 + (j + 3) * c_dim1] = f44;
1569 }
1570 if (uisec < isec)
1571 {
1572 i5 = ii + isec - 1;
1573 for (i = ii + uisec; i <= i5; ++i)
1574 {
1575 f11 = c[i + j * c_dim1];
1576 f12 = c[i + (j + 1) * c_dim1];
1577 f13 = c[i + (j + 2) * c_dim1];
1578 f14 = c[i + (j + 3) * c_dim1];
1579 i6 = ll + lsec - 1;
1580 for (l = ll; l <= i6; ++l)
1581 {
1582 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1583 257] * b[l + j * b_dim1];
1584 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1585 257] * b[l + (j + 1) * b_dim1];
1586 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1587 257] * b[l + (j + 2) * b_dim1];
1588 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1589 257] * b[l + (j + 3) * b_dim1];
1590 }
1591 c[i + j * c_dim1] = f11;
1592 c[i + (j + 1) * c_dim1] = f12;
1593 c[i + (j + 2) * c_dim1] = f13;
1594 c[i + (j + 3) * c_dim1] = f14;
1595 }
1596 }
1597 }
1598 if (ujsec < jsec)
1599 {
1600 i4 = jj + jsec - 1;
1601 for (j = jj + ujsec; j <= i4; ++j)
1602 {
1603 i5 = ii + uisec - 1;
1604 for (i = ii; i <= i5; i += 4)
1605 {
1606 f11 = c[i + j * c_dim1];
1607 f21 = c[i + 1 + j * c_dim1];
1608 f31 = c[i + 2 + j * c_dim1];
1609 f41 = c[i + 3 + j * c_dim1];
1610 i6 = ll + lsec - 1;
1611 for (l = ll; l <= i6; ++l)
1612 {
1613 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1614 257] * b[l + j * b_dim1];
1615 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
1616 257] * b[l + j * b_dim1];
1617 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
1618 257] * b[l + j * b_dim1];
1619 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
1620 257] * b[l + j * b_dim1];
1621 }
1622 c[i + j * c_dim1] = f11;
1623 c[i + 1 + j * c_dim1] = f21;
1624 c[i + 2 + j * c_dim1] = f31;
1625 c[i + 3 + j * c_dim1] = f41;
1626 }
1627 i5 = ii + isec - 1;
1628 for (i = ii + uisec; i <= i5; ++i)
1629 {
1630 f11 = c[i + j * c_dim1];
1631 i6 = ll + lsec - 1;
1632 for (l = ll; l <= i6; ++l)
1633 {
1634 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
1635 257] * b[l + j * b_dim1];
1636 }
1637 c[i + j * c_dim1] = f11;
1638 }
1639 }
1640 }
1641 }
1642 }
1643 }
1644 return;
1645 }
1646 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
1647 {
1648 if (GFC_DESCRIPTOR_RANK (a) != 1)
1649 {
1650 const GFC_REAL_4 *restrict abase_x;
1651 const GFC_REAL_4 *restrict bbase_y;
1652 GFC_REAL_4 *restrict dest_y;
1653 GFC_REAL_4 s;
1654
1655 for (y = 0; y < ycount; y++)
1656 {
1657 bbase_y = &bbase[y*bystride];
1658 dest_y = &dest[y*rystride];
1659 for (x = 0; x < xcount; x++)
1660 {
1661 abase_x = &abase[x*axstride];
1662 s = (GFC_REAL_4) 0;
1663 for (n = 0; n < count; n++)
1664 s += abase_x[n] * bbase_y[n];
1665 dest_y[x] = s;
1666 }
1667 }
1668 }
1669 else
1670 {
1671 const GFC_REAL_4 *restrict bbase_y;
1672 GFC_REAL_4 s;
1673
1674 for (y = 0; y < ycount; y++)
1675 {
1676 bbase_y = &bbase[y*bystride];
1677 s = (GFC_REAL_4) 0;
1678 for (n = 0; n < count; n++)
1679 s += abase[n*axstride] * bbase_y[n];
1680 dest[y*rystride] = s;
1681 }
1682 }
1683 }
1684 else if (axstride < aystride)
1685 {
1686 for (y = 0; y < ycount; y++)
1687 for (x = 0; x < xcount; x++)
1688 dest[x*rxstride + y*rystride] = (GFC_REAL_4)0;
1689
1690 for (y = 0; y < ycount; y++)
1691 for (n = 0; n < count; n++)
1692 for (x = 0; x < xcount; x++)
1693 /* dest[x,y] += a[x,n] * b[n,y] */
1694 dest[x*rxstride + y*rystride] +=
1695 abase[x*axstride + n*aystride] *
1696 bbase[n*bxstride + y*bystride];
1697 }
1698 else if (GFC_DESCRIPTOR_RANK (a) == 1)
1699 {
1700 const GFC_REAL_4 *restrict bbase_y;
1701 GFC_REAL_4 s;
1702
1703 for (y = 0; y < ycount; y++)
1704 {
1705 bbase_y = &bbase[y*bystride];
1706 s = (GFC_REAL_4) 0;
1707 for (n = 0; n < count; n++)
1708 s += abase[n*axstride] * bbase_y[n*bxstride];
1709 dest[y*rxstride] = s;
1710 }
1711 }
1712 else
1713 {
1714 const GFC_REAL_4 *restrict abase_x;
1715 const GFC_REAL_4 *restrict bbase_y;
1716 GFC_REAL_4 *restrict dest_y;
1717 GFC_REAL_4 s;
1718
1719 for (y = 0; y < ycount; y++)
1720 {
1721 bbase_y = &bbase[y*bystride];
1722 dest_y = &dest[y*rystride];
1723 for (x = 0; x < xcount; x++)
1724 {
1725 abase_x = &abase[x*axstride];
1726 s = (GFC_REAL_4) 0;
1727 for (n = 0; n < count; n++)
1728 s += abase_x[n*aystride] * bbase_y[n*bxstride];
1729 dest_y[x*rxstride] = s;
1730 }
1731 }
1732 }
1733 }
1734 #undef POW3
1735 #undef min
1736 #undef max
1737
1738 #endif /* HAVE_AVX512F */
1739
1740 /* Function to fall back to if there is no special processor-specific version. */
1741 static void
1742 matmul_r4_vanilla (gfc_array_r4 * const restrict retarray,
1743 gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
1744 int blas_limit, blas_call gemm)
1745 {
1746 const GFC_REAL_4 * restrict abase;
1747 const GFC_REAL_4 * restrict bbase;
1748 GFC_REAL_4 * restrict dest;
1749
1750 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
1751 index_type x, y, n, count, xcount, ycount;
1752
1753 assert (GFC_DESCRIPTOR_RANK (a) == 2
1754 || GFC_DESCRIPTOR_RANK (b) == 2);
1755
1756 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
1757
1758 Either A or B (but not both) can be rank 1:
1759
1760 o One-dimensional argument A is implicitly treated as a row matrix
1761 dimensioned [1,count], so xcount=1.
1762
1763 o One-dimensional argument B is implicitly treated as a column matrix
1764 dimensioned [count, 1], so ycount=1.
1765 */
1766
1767 if (retarray->base_addr == NULL)
1768 {
1769 if (GFC_DESCRIPTOR_RANK (a) == 1)
1770 {
1771 GFC_DIMENSION_SET(retarray->dim[0], 0,
1772 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
1773 }
1774 else if (GFC_DESCRIPTOR_RANK (b) == 1)
1775 {
1776 GFC_DIMENSION_SET(retarray->dim[0], 0,
1777 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1778 }
1779 else
1780 {
1781 GFC_DIMENSION_SET(retarray->dim[0], 0,
1782 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
1783
1784 GFC_DIMENSION_SET(retarray->dim[1], 0,
1785 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
1786 GFC_DESCRIPTOR_EXTENT(retarray,0));
1787 }
1788
1789 retarray->base_addr
1790 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_4));
1791 retarray->offset = 0;
1792 }
1793 else if (unlikely (compile_options.bounds_check))
1794 {
1795 index_type ret_extent, arg_extent;
1796
1797 if (GFC_DESCRIPTOR_RANK (a) == 1)
1798 {
1799 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1800 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1801 if (arg_extent != ret_extent)
1802 runtime_error ("Incorrect extent in return array in"
1803 " MATMUL intrinsic: is %ld, should be %ld",
1804 (long int) ret_extent, (long int) arg_extent);
1805 }
1806 else if (GFC_DESCRIPTOR_RANK (b) == 1)
1807 {
1808 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1809 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1810 if (arg_extent != ret_extent)
1811 runtime_error ("Incorrect extent in return array in"
1812 " MATMUL intrinsic: is %ld, should be %ld",
1813 (long int) ret_extent, (long int) arg_extent);
1814 }
1815 else
1816 {
1817 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
1818 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
1819 if (arg_extent != ret_extent)
1820 runtime_error ("Incorrect extent in return array in"
1821 " MATMUL intrinsic for dimension 1:"
1822 " is %ld, should be %ld",
1823 (long int) ret_extent, (long int) arg_extent);
1824
1825 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
1826 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
1827 if (arg_extent != ret_extent)
1828 runtime_error ("Incorrect extent in return array in"
1829 " MATMUL intrinsic for dimension 2:"
1830 " is %ld, should be %ld",
1831 (long int) ret_extent, (long int) arg_extent);
1832 }
1833 }
1834
1835
1836 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
1837 {
1838 /* One-dimensional result may be addressed in the code below
1839 either as a row or a column matrix. We want both cases to
1840 work. */
1841 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1842 }
1843 else
1844 {
1845 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
1846 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
1847 }
1848
1849
1850 if (GFC_DESCRIPTOR_RANK (a) == 1)
1851 {
1852 /* Treat it as a a row matrix A[1,count]. */
1853 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1854 aystride = 1;
1855
1856 xcount = 1;
1857 count = GFC_DESCRIPTOR_EXTENT(a,0);
1858 }
1859 else
1860 {
1861 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
1862 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
1863
1864 count = GFC_DESCRIPTOR_EXTENT(a,1);
1865 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
1866 }
1867
1868 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
1869 {
1870 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
1871 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
1872 }
1873
1874 if (GFC_DESCRIPTOR_RANK (b) == 1)
1875 {
1876 /* Treat it as a column matrix B[count,1] */
1877 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1878
1879 /* bystride should never be used for 1-dimensional b.
1880 in case it is we want it to cause a segfault, rather than
1881 an incorrect result. */
1882 bystride = 0xDEADBEEF;
1883 ycount = 1;
1884 }
1885 else
1886 {
1887 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
1888 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
1889 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
1890 }
1891
1892 abase = a->base_addr;
1893 bbase = b->base_addr;
1894 dest = retarray->base_addr;
1895
1896 /* Now that everything is set up, we perform the multiplication
1897 itself. */
1898
1899 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
1900 #define min(a,b) ((a) <= (b) ? (a) : (b))
1901 #define max(a,b) ((a) >= (b) ? (a) : (b))
1902
1903 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
1904 && (bxstride == 1 || bystride == 1)
1905 && (((float) xcount) * ((float) ycount) * ((float) count)
1906 > POW3(blas_limit)))
1907 {
1908 const int m = xcount, n = ycount, k = count, ldc = rystride;
1909 const GFC_REAL_4 one = 1, zero = 0;
1910 const int lda = (axstride == 1) ? aystride : axstride,
1911 ldb = (bxstride == 1) ? bystride : bxstride;
1912
1913 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
1914 {
1915 assert (gemm != NULL);
1916 gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
1917 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
1918 &ldc, 1, 1);
1919 return;
1920 }
1921 }
1922
1923 if (rxstride == 1 && axstride == 1 && bxstride == 1)
1924 {
1925 /* This block of code implements a tuned matmul, derived from
1926 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
1927
1928 Bo Kagstrom and Per Ling
1929 Department of Computing Science
1930 Umea University
1931 S-901 87 Umea, Sweden
1932
1933 from netlib.org, translated to C, and modified for matmul.m4. */
1934
1935 const GFC_REAL_4 *a, *b;
1936 GFC_REAL_4 *c;
1937 const index_type m = xcount, n = ycount, k = count;
1938
1939 /* System generated locals */
1940 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
1941 i1, i2, i3, i4, i5, i6;
1942
1943 /* Local variables */
1944 GFC_REAL_4 f11, f12, f21, f22, f31, f32, f41, f42,
1945 f13, f14, f23, f24, f33, f34, f43, f44;
1946 index_type i, j, l, ii, jj, ll;
1947 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
1948
1949 a = abase;
1950 b = bbase;
1951 c = retarray->base_addr;
1952
1953 /* Parameter adjustments */
1954 c_dim1 = rystride;
1955 c_offset = 1 + c_dim1;
1956 c -= c_offset;
1957 a_dim1 = aystride;
1958 a_offset = 1 + a_dim1;
1959 a -= a_offset;
1960 b_dim1 = bystride;
1961 b_offset = 1 + b_dim1;
1962 b -= b_offset;
1963
1964 /* Early exit if possible */
1965 if (m == 0 || n == 0 || k == 0)
1966 return;
1967
1968 /* Adjust size of t1 to what is needed. */
1969 index_type t1_dim;
1970 t1_dim = (a_dim1-1) * 256 + b_dim1;
1971 if (t1_dim > 65536)
1972 t1_dim = 65536;
1973
1974 #pragma GCC diagnostic push
1975 #pragma GCC diagnostic ignored "-Wvla"
1976 GFC_REAL_4 t1[t1_dim]; /* was [256][256] */
1977 #pragma GCC diagnostic pop
1978
1979 /* Empty c first. */
1980 for (j=1; j<=n; j++)
1981 for (i=1; i<=m; i++)
1982 c[i + j * c_dim1] = (GFC_REAL_4)0;
1983
1984 /* Start turning the crank. */
1985 i1 = n;
1986 for (jj = 1; jj <= i1; jj += 512)
1987 {
1988 /* Computing MIN */
1989 i2 = 512;
1990 i3 = n - jj + 1;
1991 jsec = min(i2,i3);
1992 ujsec = jsec - jsec % 4;
1993 i2 = k;
1994 for (ll = 1; ll <= i2; ll += 256)
1995 {
1996 /* Computing MIN */
1997 i3 = 256;
1998 i4 = k - ll + 1;
1999 lsec = min(i3,i4);
2000 ulsec = lsec - lsec % 2;
2001
2002 i3 = m;
2003 for (ii = 1; ii <= i3; ii += 256)
2004 {
2005 /* Computing MIN */
2006 i4 = 256;
2007 i5 = m - ii + 1;
2008 isec = min(i4,i5);
2009 uisec = isec - isec % 2;
2010 i4 = ll + ulsec - 1;
2011 for (l = ll; l <= i4; l += 2)
2012 {
2013 i5 = ii + uisec - 1;
2014 for (i = ii; i <= i5; i += 2)
2015 {
2016 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
2017 a[i + l * a_dim1];
2018 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
2019 a[i + (l + 1) * a_dim1];
2020 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
2021 a[i + 1 + l * a_dim1];
2022 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
2023 a[i + 1 + (l + 1) * a_dim1];
2024 }
2025 if (uisec < isec)
2026 {
2027 t1[l - ll + 1 + (isec << 8) - 257] =
2028 a[ii + isec - 1 + l * a_dim1];
2029 t1[l - ll + 2 + (isec << 8) - 257] =
2030 a[ii + isec - 1 + (l + 1) * a_dim1];
2031 }
2032 }
2033 if (ulsec < lsec)
2034 {
2035 i4 = ii + isec - 1;
2036 for (i = ii; i<= i4; ++i)
2037 {
2038 t1[lsec + ((i - ii + 1) << 8) - 257] =
2039 a[i + (ll + lsec - 1) * a_dim1];
2040 }
2041 }
2042
2043 uisec = isec - isec % 4;
2044 i4 = jj + ujsec - 1;
2045 for (j = jj; j <= i4; j += 4)
2046 {
2047 i5 = ii + uisec - 1;
2048 for (i = ii; i <= i5; i += 4)
2049 {
2050 f11 = c[i + j * c_dim1];
2051 f21 = c[i + 1 + j * c_dim1];
2052 f12 = c[i + (j + 1) * c_dim1];
2053 f22 = c[i + 1 + (j + 1) * c_dim1];
2054 f13 = c[i + (j + 2) * c_dim1];
2055 f23 = c[i + 1 + (j + 2) * c_dim1];
2056 f14 = c[i + (j + 3) * c_dim1];
2057 f24 = c[i + 1 + (j + 3) * c_dim1];
2058 f31 = c[i + 2 + j * c_dim1];
2059 f41 = c[i + 3 + j * c_dim1];
2060 f32 = c[i + 2 + (j + 1) * c_dim1];
2061 f42 = c[i + 3 + (j + 1) * c_dim1];
2062 f33 = c[i + 2 + (j + 2) * c_dim1];
2063 f43 = c[i + 3 + (j + 2) * c_dim1];
2064 f34 = c[i + 2 + (j + 3) * c_dim1];
2065 f44 = c[i + 3 + (j + 3) * c_dim1];
2066 i6 = ll + lsec - 1;
2067 for (l = ll; l <= i6; ++l)
2068 {
2069 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2070 * b[l + j * b_dim1];
2071 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2072 * b[l + j * b_dim1];
2073 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2074 * b[l + (j + 1) * b_dim1];
2075 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2076 * b[l + (j + 1) * b_dim1];
2077 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2078 * b[l + (j + 2) * b_dim1];
2079 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2080 * b[l + (j + 2) * b_dim1];
2081 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2082 * b[l + (j + 3) * b_dim1];
2083 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2084 * b[l + (j + 3) * b_dim1];
2085 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2086 * b[l + j * b_dim1];
2087 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2088 * b[l + j * b_dim1];
2089 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2090 * b[l + (j + 1) * b_dim1];
2091 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2092 * b[l + (j + 1) * b_dim1];
2093 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2094 * b[l + (j + 2) * b_dim1];
2095 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2096 * b[l + (j + 2) * b_dim1];
2097 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2098 * b[l + (j + 3) * b_dim1];
2099 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2100 * b[l + (j + 3) * b_dim1];
2101 }
2102 c[i + j * c_dim1] = f11;
2103 c[i + 1 + j * c_dim1] = f21;
2104 c[i + (j + 1) * c_dim1] = f12;
2105 c[i + 1 + (j + 1) * c_dim1] = f22;
2106 c[i + (j + 2) * c_dim1] = f13;
2107 c[i + 1 + (j + 2) * c_dim1] = f23;
2108 c[i + (j + 3) * c_dim1] = f14;
2109 c[i + 1 + (j + 3) * c_dim1] = f24;
2110 c[i + 2 + j * c_dim1] = f31;
2111 c[i + 3 + j * c_dim1] = f41;
2112 c[i + 2 + (j + 1) * c_dim1] = f32;
2113 c[i + 3 + (j + 1) * c_dim1] = f42;
2114 c[i + 2 + (j + 2) * c_dim1] = f33;
2115 c[i + 3 + (j + 2) * c_dim1] = f43;
2116 c[i + 2 + (j + 3) * c_dim1] = f34;
2117 c[i + 3 + (j + 3) * c_dim1] = f44;
2118 }
2119 if (uisec < isec)
2120 {
2121 i5 = ii + isec - 1;
2122 for (i = ii + uisec; i <= i5; ++i)
2123 {
2124 f11 = c[i + j * c_dim1];
2125 f12 = c[i + (j + 1) * c_dim1];
2126 f13 = c[i + (j + 2) * c_dim1];
2127 f14 = c[i + (j + 3) * c_dim1];
2128 i6 = ll + lsec - 1;
2129 for (l = ll; l <= i6; ++l)
2130 {
2131 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2132 257] * b[l + j * b_dim1];
2133 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2134 257] * b[l + (j + 1) * b_dim1];
2135 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2136 257] * b[l + (j + 2) * b_dim1];
2137 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2138 257] * b[l + (j + 3) * b_dim1];
2139 }
2140 c[i + j * c_dim1] = f11;
2141 c[i + (j + 1) * c_dim1] = f12;
2142 c[i + (j + 2) * c_dim1] = f13;
2143 c[i + (j + 3) * c_dim1] = f14;
2144 }
2145 }
2146 }
2147 if (ujsec < jsec)
2148 {
2149 i4 = jj + jsec - 1;
2150 for (j = jj + ujsec; j <= i4; ++j)
2151 {
2152 i5 = ii + uisec - 1;
2153 for (i = ii; i <= i5; i += 4)
2154 {
2155 f11 = c[i + j * c_dim1];
2156 f21 = c[i + 1 + j * c_dim1];
2157 f31 = c[i + 2 + j * c_dim1];
2158 f41 = c[i + 3 + j * c_dim1];
2159 i6 = ll + lsec - 1;
2160 for (l = ll; l <= i6; ++l)
2161 {
2162 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2163 257] * b[l + j * b_dim1];
2164 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
2165 257] * b[l + j * b_dim1];
2166 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
2167 257] * b[l + j * b_dim1];
2168 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
2169 257] * b[l + j * b_dim1];
2170 }
2171 c[i + j * c_dim1] = f11;
2172 c[i + 1 + j * c_dim1] = f21;
2173 c[i + 2 + j * c_dim1] = f31;
2174 c[i + 3 + j * c_dim1] = f41;
2175 }
2176 i5 = ii + isec - 1;
2177 for (i = ii + uisec; i <= i5; ++i)
2178 {
2179 f11 = c[i + j * c_dim1];
2180 i6 = ll + lsec - 1;
2181 for (l = ll; l <= i6; ++l)
2182 {
2183 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2184 257] * b[l + j * b_dim1];
2185 }
2186 c[i + j * c_dim1] = f11;
2187 }
2188 }
2189 }
2190 }
2191 }
2192 }
2193 return;
2194 }
2195 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
2196 {
2197 if (GFC_DESCRIPTOR_RANK (a) != 1)
2198 {
2199 const GFC_REAL_4 *restrict abase_x;
2200 const GFC_REAL_4 *restrict bbase_y;
2201 GFC_REAL_4 *restrict dest_y;
2202 GFC_REAL_4 s;
2203
2204 for (y = 0; y < ycount; y++)
2205 {
2206 bbase_y = &bbase[y*bystride];
2207 dest_y = &dest[y*rystride];
2208 for (x = 0; x < xcount; x++)
2209 {
2210 abase_x = &abase[x*axstride];
2211 s = (GFC_REAL_4) 0;
2212 for (n = 0; n < count; n++)
2213 s += abase_x[n] * bbase_y[n];
2214 dest_y[x] = s;
2215 }
2216 }
2217 }
2218 else
2219 {
2220 const GFC_REAL_4 *restrict bbase_y;
2221 GFC_REAL_4 s;
2222
2223 for (y = 0; y < ycount; y++)
2224 {
2225 bbase_y = &bbase[y*bystride];
2226 s = (GFC_REAL_4) 0;
2227 for (n = 0; n < count; n++)
2228 s += abase[n*axstride] * bbase_y[n];
2229 dest[y*rystride] = s;
2230 }
2231 }
2232 }
2233 else if (axstride < aystride)
2234 {
2235 for (y = 0; y < ycount; y++)
2236 for (x = 0; x < xcount; x++)
2237 dest[x*rxstride + y*rystride] = (GFC_REAL_4)0;
2238
2239 for (y = 0; y < ycount; y++)
2240 for (n = 0; n < count; n++)
2241 for (x = 0; x < xcount; x++)
2242 /* dest[x,y] += a[x,n] * b[n,y] */
2243 dest[x*rxstride + y*rystride] +=
2244 abase[x*axstride + n*aystride] *
2245 bbase[n*bxstride + y*bystride];
2246 }
2247 else if (GFC_DESCRIPTOR_RANK (a) == 1)
2248 {
2249 const GFC_REAL_4 *restrict bbase_y;
2250 GFC_REAL_4 s;
2251
2252 for (y = 0; y < ycount; y++)
2253 {
2254 bbase_y = &bbase[y*bystride];
2255 s = (GFC_REAL_4) 0;
2256 for (n = 0; n < count; n++)
2257 s += abase[n*axstride] * bbase_y[n*bxstride];
2258 dest[y*rxstride] = s;
2259 }
2260 }
2261 else
2262 {
2263 const GFC_REAL_4 *restrict abase_x;
2264 const GFC_REAL_4 *restrict bbase_y;
2265 GFC_REAL_4 *restrict dest_y;
2266 GFC_REAL_4 s;
2267
2268 for (y = 0; y < ycount; y++)
2269 {
2270 bbase_y = &bbase[y*bystride];
2271 dest_y = &dest[y*rystride];
2272 for (x = 0; x < xcount; x++)
2273 {
2274 abase_x = &abase[x*axstride];
2275 s = (GFC_REAL_4) 0;
2276 for (n = 0; n < count; n++)
2277 s += abase_x[n*aystride] * bbase_y[n*bxstride];
2278 dest_y[x*rxstride] = s;
2279 }
2280 }
2281 }
2282 }
2283 #undef POW3
2284 #undef min
2285 #undef max
2286
2287
2288 /* Compiling main function, with selection code for the processor. */
2289
2290 /* Currently, this is i386 only. Adjust for other architectures. */
2291
2292 #include <config/i386/cpuinfo.h>
2293 void matmul_r4 (gfc_array_r4 * const restrict retarray,
2294 gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
2295 int blas_limit, blas_call gemm)
2296 {
2297 static void (*matmul_p) (gfc_array_r4 * const restrict retarray,
2298 gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
2299 int blas_limit, blas_call gemm);
2300
2301 void (*matmul_fn) (gfc_array_r4 * const restrict retarray,
2302 gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
2303 int blas_limit, blas_call gemm);
2304
2305 matmul_fn = __atomic_load_n (&matmul_p, __ATOMIC_RELAXED);
2306 if (matmul_fn == NULL)
2307 {
2308 matmul_fn = matmul_r4_vanilla;
2309 if (__cpu_model.__cpu_vendor == VENDOR_INTEL)
2310 {
2311 /* Run down the available processors in order of preference. */
2312 #ifdef HAVE_AVX512F
2313 if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX512F))
2314 {
2315 matmul_fn = matmul_r4_avx512f;
2316 goto store;
2317 }
2318
2319 #endif /* HAVE_AVX512F */
2320
2321 #ifdef HAVE_AVX2
2322 if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
2323 && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
2324 {
2325 matmul_fn = matmul_r4_avx2;
2326 goto store;
2327 }
2328
2329 #endif
2330
2331 #ifdef HAVE_AVX
2332 if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
2333 {
2334 matmul_fn = matmul_r4_avx;
2335 goto store;
2336 }
2337 #endif /* HAVE_AVX */
2338 }
2339 store:
2340 __atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
2341 }
2342
2343 (*matmul_fn) (retarray, a, b, try_blas, blas_limit, gemm);
2344 }
2345
2346 #else /* Just the vanilla function. */
2347
2348 void
2349 matmul_r4 (gfc_array_r4 * const restrict retarray,
2350 gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
2351 int blas_limit, blas_call gemm)
2352 {
2353 const GFC_REAL_4 * restrict abase;
2354 const GFC_REAL_4 * restrict bbase;
2355 GFC_REAL_4 * restrict dest;
2356
2357 index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
2358 index_type x, y, n, count, xcount, ycount;
2359
2360 assert (GFC_DESCRIPTOR_RANK (a) == 2
2361 || GFC_DESCRIPTOR_RANK (b) == 2);
2362
2363 /* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
2364
2365 Either A or B (but not both) can be rank 1:
2366
2367 o One-dimensional argument A is implicitly treated as a row matrix
2368 dimensioned [1,count], so xcount=1.
2369
2370 o One-dimensional argument B is implicitly treated as a column matrix
2371 dimensioned [count, 1], so ycount=1.
2372 */
2373
2374 if (retarray->base_addr == NULL)
2375 {
2376 if (GFC_DESCRIPTOR_RANK (a) == 1)
2377 {
2378 GFC_DIMENSION_SET(retarray->dim[0], 0,
2379 GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
2380 }
2381 else if (GFC_DESCRIPTOR_RANK (b) == 1)
2382 {
2383 GFC_DIMENSION_SET(retarray->dim[0], 0,
2384 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
2385 }
2386 else
2387 {
2388 GFC_DIMENSION_SET(retarray->dim[0], 0,
2389 GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
2390
2391 GFC_DIMENSION_SET(retarray->dim[1], 0,
2392 GFC_DESCRIPTOR_EXTENT(b,1) - 1,
2393 GFC_DESCRIPTOR_EXTENT(retarray,0));
2394 }
2395
2396 retarray->base_addr
2397 = xmallocarray (size0 ((array_t *) retarray), sizeof (GFC_REAL_4));
2398 retarray->offset = 0;
2399 }
2400 else if (unlikely (compile_options.bounds_check))
2401 {
2402 index_type ret_extent, arg_extent;
2403
2404 if (GFC_DESCRIPTOR_RANK (a) == 1)
2405 {
2406 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
2407 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2408 if (arg_extent != ret_extent)
2409 runtime_error ("Incorrect extent in return array in"
2410 " MATMUL intrinsic: is %ld, should be %ld",
2411 (long int) ret_extent, (long int) arg_extent);
2412 }
2413 else if (GFC_DESCRIPTOR_RANK (b) == 1)
2414 {
2415 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
2416 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2417 if (arg_extent != ret_extent)
2418 runtime_error ("Incorrect extent in return array in"
2419 " MATMUL intrinsic: is %ld, should be %ld",
2420 (long int) ret_extent, (long int) arg_extent);
2421 }
2422 else
2423 {
2424 arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
2425 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
2426 if (arg_extent != ret_extent)
2427 runtime_error ("Incorrect extent in return array in"
2428 " MATMUL intrinsic for dimension 1:"
2429 " is %ld, should be %ld",
2430 (long int) ret_extent, (long int) arg_extent);
2431
2432 arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
2433 ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
2434 if (arg_extent != ret_extent)
2435 runtime_error ("Incorrect extent in return array in"
2436 " MATMUL intrinsic for dimension 2:"
2437 " is %ld, should be %ld",
2438 (long int) ret_extent, (long int) arg_extent);
2439 }
2440 }
2441
2442
2443 if (GFC_DESCRIPTOR_RANK (retarray) == 1)
2444 {
2445 /* One-dimensional result may be addressed in the code below
2446 either as a row or a column matrix. We want both cases to
2447 work. */
2448 rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
2449 }
2450 else
2451 {
2452 rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
2453 rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
2454 }
2455
2456
2457 if (GFC_DESCRIPTOR_RANK (a) == 1)
2458 {
2459 /* Treat it as a a row matrix A[1,count]. */
2460 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
2461 aystride = 1;
2462
2463 xcount = 1;
2464 count = GFC_DESCRIPTOR_EXTENT(a,0);
2465 }
2466 else
2467 {
2468 axstride = GFC_DESCRIPTOR_STRIDE(a,0);
2469 aystride = GFC_DESCRIPTOR_STRIDE(a,1);
2470
2471 count = GFC_DESCRIPTOR_EXTENT(a,1);
2472 xcount = GFC_DESCRIPTOR_EXTENT(a,0);
2473 }
2474
2475 if (count != GFC_DESCRIPTOR_EXTENT(b,0))
2476 {
2477 if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
2478 runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
2479 }
2480
2481 if (GFC_DESCRIPTOR_RANK (b) == 1)
2482 {
2483 /* Treat it as a column matrix B[count,1] */
2484 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
2485
2486 /* bystride should never be used for 1-dimensional b.
2487 in case it is we want it to cause a segfault, rather than
2488 an incorrect result. */
2489 bystride = 0xDEADBEEF;
2490 ycount = 1;
2491 }
2492 else
2493 {
2494 bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
2495 bystride = GFC_DESCRIPTOR_STRIDE(b,1);
2496 ycount = GFC_DESCRIPTOR_EXTENT(b,1);
2497 }
2498
2499 abase = a->base_addr;
2500 bbase = b->base_addr;
2501 dest = retarray->base_addr;
2502
2503 /* Now that everything is set up, we perform the multiplication
2504 itself. */
2505
2506 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
2507 #define min(a,b) ((a) <= (b) ? (a) : (b))
2508 #define max(a,b) ((a) >= (b) ? (a) : (b))
2509
2510 if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
2511 && (bxstride == 1 || bystride == 1)
2512 && (((float) xcount) * ((float) ycount) * ((float) count)
2513 > POW3(blas_limit)))
2514 {
2515 const int m = xcount, n = ycount, k = count, ldc = rystride;
2516 const GFC_REAL_4 one = 1, zero = 0;
2517 const int lda = (axstride == 1) ? aystride : axstride,
2518 ldb = (bxstride == 1) ? bystride : bxstride;
2519
2520 if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
2521 {
2522 assert (gemm != NULL);
2523 gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
2524 &n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
2525 &ldc, 1, 1);
2526 return;
2527 }
2528 }
2529
2530 if (rxstride == 1 && axstride == 1 && bxstride == 1)
2531 {
2532 /* This block of code implements a tuned matmul, derived from
2533 Superscalar GEMM-based level 3 BLAS, Beta version 0.1
2534
2535 Bo Kagstrom and Per Ling
2536 Department of Computing Science
2537 Umea University
2538 S-901 87 Umea, Sweden
2539
2540 from netlib.org, translated to C, and modified for matmul.m4. */
2541
2542 const GFC_REAL_4 *a, *b;
2543 GFC_REAL_4 *c;
2544 const index_type m = xcount, n = ycount, k = count;
2545
2546 /* System generated locals */
2547 index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
2548 i1, i2, i3, i4, i5, i6;
2549
2550 /* Local variables */
2551 GFC_REAL_4 f11, f12, f21, f22, f31, f32, f41, f42,
2552 f13, f14, f23, f24, f33, f34, f43, f44;
2553 index_type i, j, l, ii, jj, ll;
2554 index_type isec, jsec, lsec, uisec, ujsec, ulsec;
2555
2556 a = abase;
2557 b = bbase;
2558 c = retarray->base_addr;
2559
2560 /* Parameter adjustments */
2561 c_dim1 = rystride;
2562 c_offset = 1 + c_dim1;
2563 c -= c_offset;
2564 a_dim1 = aystride;
2565 a_offset = 1 + a_dim1;
2566 a -= a_offset;
2567 b_dim1 = bystride;
2568 b_offset = 1 + b_dim1;
2569 b -= b_offset;
2570
2571 /* Early exit if possible */
2572 if (m == 0 || n == 0 || k == 0)
2573 return;
2574
2575 /* Adjust size of t1 to what is needed. */
2576 index_type t1_dim;
2577 t1_dim = (a_dim1-1) * 256 + b_dim1;
2578 if (t1_dim > 65536)
2579 t1_dim = 65536;
2580
2581 #pragma GCC diagnostic push
2582 #pragma GCC diagnostic ignored "-Wvla"
2583 GFC_REAL_4 t1[t1_dim]; /* was [256][256] */
2584 #pragma GCC diagnostic pop
2585
2586 /* Empty c first. */
2587 for (j=1; j<=n; j++)
2588 for (i=1; i<=m; i++)
2589 c[i + j * c_dim1] = (GFC_REAL_4)0;
2590
2591 /* Start turning the crank. */
2592 i1 = n;
2593 for (jj = 1; jj <= i1; jj += 512)
2594 {
2595 /* Computing MIN */
2596 i2 = 512;
2597 i3 = n - jj + 1;
2598 jsec = min(i2,i3);
2599 ujsec = jsec - jsec % 4;
2600 i2 = k;
2601 for (ll = 1; ll <= i2; ll += 256)
2602 {
2603 /* Computing MIN */
2604 i3 = 256;
2605 i4 = k - ll + 1;
2606 lsec = min(i3,i4);
2607 ulsec = lsec - lsec % 2;
2608
2609 i3 = m;
2610 for (ii = 1; ii <= i3; ii += 256)
2611 {
2612 /* Computing MIN */
2613 i4 = 256;
2614 i5 = m - ii + 1;
2615 isec = min(i4,i5);
2616 uisec = isec - isec % 2;
2617 i4 = ll + ulsec - 1;
2618 for (l = ll; l <= i4; l += 2)
2619 {
2620 i5 = ii + uisec - 1;
2621 for (i = ii; i <= i5; i += 2)
2622 {
2623 t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
2624 a[i + l * a_dim1];
2625 t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
2626 a[i + (l + 1) * a_dim1];
2627 t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
2628 a[i + 1 + l * a_dim1];
2629 t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
2630 a[i + 1 + (l + 1) * a_dim1];
2631 }
2632 if (uisec < isec)
2633 {
2634 t1[l - ll + 1 + (isec << 8) - 257] =
2635 a[ii + isec - 1 + l * a_dim1];
2636 t1[l - ll + 2 + (isec << 8) - 257] =
2637 a[ii + isec - 1 + (l + 1) * a_dim1];
2638 }
2639 }
2640 if (ulsec < lsec)
2641 {
2642 i4 = ii + isec - 1;
2643 for (i = ii; i<= i4; ++i)
2644 {
2645 t1[lsec + ((i - ii + 1) << 8) - 257] =
2646 a[i + (ll + lsec - 1) * a_dim1];
2647 }
2648 }
2649
2650 uisec = isec - isec % 4;
2651 i4 = jj + ujsec - 1;
2652 for (j = jj; j <= i4; j += 4)
2653 {
2654 i5 = ii + uisec - 1;
2655 for (i = ii; i <= i5; i += 4)
2656 {
2657 f11 = c[i + j * c_dim1];
2658 f21 = c[i + 1 + j * c_dim1];
2659 f12 = c[i + (j + 1) * c_dim1];
2660 f22 = c[i + 1 + (j + 1) * c_dim1];
2661 f13 = c[i + (j + 2) * c_dim1];
2662 f23 = c[i + 1 + (j + 2) * c_dim1];
2663 f14 = c[i + (j + 3) * c_dim1];
2664 f24 = c[i + 1 + (j + 3) * c_dim1];
2665 f31 = c[i + 2 + j * c_dim1];
2666 f41 = c[i + 3 + j * c_dim1];
2667 f32 = c[i + 2 + (j + 1) * c_dim1];
2668 f42 = c[i + 3 + (j + 1) * c_dim1];
2669 f33 = c[i + 2 + (j + 2) * c_dim1];
2670 f43 = c[i + 3 + (j + 2) * c_dim1];
2671 f34 = c[i + 2 + (j + 3) * c_dim1];
2672 f44 = c[i + 3 + (j + 3) * c_dim1];
2673 i6 = ll + lsec - 1;
2674 for (l = ll; l <= i6; ++l)
2675 {
2676 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2677 * b[l + j * b_dim1];
2678 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2679 * b[l + j * b_dim1];
2680 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2681 * b[l + (j + 1) * b_dim1];
2682 f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2683 * b[l + (j + 1) * b_dim1];
2684 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2685 * b[l + (j + 2) * b_dim1];
2686 f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2687 * b[l + (j + 2) * b_dim1];
2688 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
2689 * b[l + (j + 3) * b_dim1];
2690 f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
2691 * b[l + (j + 3) * b_dim1];
2692 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2693 * b[l + j * b_dim1];
2694 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2695 * b[l + j * b_dim1];
2696 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2697 * b[l + (j + 1) * b_dim1];
2698 f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2699 * b[l + (j + 1) * b_dim1];
2700 f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2701 * b[l + (j + 2) * b_dim1];
2702 f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2703 * b[l + (j + 2) * b_dim1];
2704 f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
2705 * b[l + (j + 3) * b_dim1];
2706 f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
2707 * b[l + (j + 3) * b_dim1];
2708 }
2709 c[i + j * c_dim1] = f11;
2710 c[i + 1 + j * c_dim1] = f21;
2711 c[i + (j + 1) * c_dim1] = f12;
2712 c[i + 1 + (j + 1) * c_dim1] = f22;
2713 c[i + (j + 2) * c_dim1] = f13;
2714 c[i + 1 + (j + 2) * c_dim1] = f23;
2715 c[i + (j + 3) * c_dim1] = f14;
2716 c[i + 1 + (j + 3) * c_dim1] = f24;
2717 c[i + 2 + j * c_dim1] = f31;
2718 c[i + 3 + j * c_dim1] = f41;
2719 c[i + 2 + (j + 1) * c_dim1] = f32;
2720 c[i + 3 + (j + 1) * c_dim1] = f42;
2721 c[i + 2 + (j + 2) * c_dim1] = f33;
2722 c[i + 3 + (j + 2) * c_dim1] = f43;
2723 c[i + 2 + (j + 3) * c_dim1] = f34;
2724 c[i + 3 + (j + 3) * c_dim1] = f44;
2725 }
2726 if (uisec < isec)
2727 {
2728 i5 = ii + isec - 1;
2729 for (i = ii + uisec; i <= i5; ++i)
2730 {
2731 f11 = c[i + j * c_dim1];
2732 f12 = c[i + (j + 1) * c_dim1];
2733 f13 = c[i + (j + 2) * c_dim1];
2734 f14 = c[i + (j + 3) * c_dim1];
2735 i6 = ll + lsec - 1;
2736 for (l = ll; l <= i6; ++l)
2737 {
2738 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2739 257] * b[l + j * b_dim1];
2740 f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2741 257] * b[l + (j + 1) * b_dim1];
2742 f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2743 257] * b[l + (j + 2) * b_dim1];
2744 f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2745 257] * b[l + (j + 3) * b_dim1];
2746 }
2747 c[i + j * c_dim1] = f11;
2748 c[i + (j + 1) * c_dim1] = f12;
2749 c[i + (j + 2) * c_dim1] = f13;
2750 c[i + (j + 3) * c_dim1] = f14;
2751 }
2752 }
2753 }
2754 if (ujsec < jsec)
2755 {
2756 i4 = jj + jsec - 1;
2757 for (j = jj + ujsec; j <= i4; ++j)
2758 {
2759 i5 = ii + uisec - 1;
2760 for (i = ii; i <= i5; i += 4)
2761 {
2762 f11 = c[i + j * c_dim1];
2763 f21 = c[i + 1 + j * c_dim1];
2764 f31 = c[i + 2 + j * c_dim1];
2765 f41 = c[i + 3 + j * c_dim1];
2766 i6 = ll + lsec - 1;
2767 for (l = ll; l <= i6; ++l)
2768 {
2769 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2770 257] * b[l + j * b_dim1];
2771 f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
2772 257] * b[l + j * b_dim1];
2773 f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
2774 257] * b[l + j * b_dim1];
2775 f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
2776 257] * b[l + j * b_dim1];
2777 }
2778 c[i + j * c_dim1] = f11;
2779 c[i + 1 + j * c_dim1] = f21;
2780 c[i + 2 + j * c_dim1] = f31;
2781 c[i + 3 + j * c_dim1] = f41;
2782 }
2783 i5 = ii + isec - 1;
2784 for (i = ii + uisec; i <= i5; ++i)
2785 {
2786 f11 = c[i + j * c_dim1];
2787 i6 = ll + lsec - 1;
2788 for (l = ll; l <= i6; ++l)
2789 {
2790 f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
2791 257] * b[l + j * b_dim1];
2792 }
2793 c[i + j * c_dim1] = f11;
2794 }
2795 }
2796 }
2797 }
2798 }
2799 }
2800 return;
2801 }
2802 else if (rxstride == 1 && aystride == 1 && bxstride == 1)
2803 {
2804 if (GFC_DESCRIPTOR_RANK (a) != 1)
2805 {
2806 const GFC_REAL_4 *restrict abase_x;
2807 const GFC_REAL_4 *restrict bbase_y;
2808 GFC_REAL_4 *restrict dest_y;
2809 GFC_REAL_4 s;
2810
2811 for (y = 0; y < ycount; y++)
2812 {
2813 bbase_y = &bbase[y*bystride];
2814 dest_y = &dest[y*rystride];
2815 for (x = 0; x < xcount; x++)
2816 {
2817 abase_x = &abase[x*axstride];
2818 s = (GFC_REAL_4) 0;
2819 for (n = 0; n < count; n++)
2820 s += abase_x[n] * bbase_y[n];
2821 dest_y[x] = s;
2822 }
2823 }
2824 }
2825 else
2826 {
2827 const GFC_REAL_4 *restrict bbase_y;
2828 GFC_REAL_4 s;
2829
2830 for (y = 0; y < ycount; y++)
2831 {
2832 bbase_y = &bbase[y*bystride];
2833 s = (GFC_REAL_4) 0;
2834 for (n = 0; n < count; n++)
2835 s += abase[n*axstride] * bbase_y[n];
2836 dest[y*rystride] = s;
2837 }
2838 }
2839 }
2840 else if (axstride < aystride)
2841 {
2842 for (y = 0; y < ycount; y++)
2843 for (x = 0; x < xcount; x++)
2844 dest[x*rxstride + y*rystride] = (GFC_REAL_4)0;
2845
2846 for (y = 0; y < ycount; y++)
2847 for (n = 0; n < count; n++)
2848 for (x = 0; x < xcount; x++)
2849 /* dest[x,y] += a[x,n] * b[n,y] */
2850 dest[x*rxstride + y*rystride] +=
2851 abase[x*axstride + n*aystride] *
2852 bbase[n*bxstride + y*bystride];
2853 }
2854 else if (GFC_DESCRIPTOR_RANK (a) == 1)
2855 {
2856 const GFC_REAL_4 *restrict bbase_y;
2857 GFC_REAL_4 s;
2858
2859 for (y = 0; y < ycount; y++)
2860 {
2861 bbase_y = &bbase[y*bystride];
2862 s = (GFC_REAL_4) 0;
2863 for (n = 0; n < count; n++)
2864 s += abase[n*axstride] * bbase_y[n*bxstride];
2865 dest[y*rxstride] = s;
2866 }
2867 }
2868 else
2869 {
2870 const GFC_REAL_4 *restrict abase_x;
2871 const GFC_REAL_4 *restrict bbase_y;
2872 GFC_REAL_4 *restrict dest_y;
2873 GFC_REAL_4 s;
2874
2875 for (y = 0; y < ycount; y++)
2876 {
2877 bbase_y = &bbase[y*bystride];
2878 dest_y = &dest[y*rystride];
2879 for (x = 0; x < xcount; x++)
2880 {
2881 abase_x = &abase[x*axstride];
2882 s = (GFC_REAL_4) 0;
2883 for (n = 0; n < count; n++)
2884 s += abase_x[n*aystride] * bbase_y[n*bxstride];
2885 dest_y[x*rxstride] = s;
2886 }
2887 }
2888 }
2889 }
2890 #undef POW3
2891 #undef min
2892 #undef max
2893
2894 #endif
2895 #endif
2896